17
17
"""Compiled extensions for doing compression."""
19
from __future__ import absolute_import
22
20
cdef extern from "python-compat.h":
25
from libc.stdlib cimport (
28
from libc.string cimport (
32
from cpython.bytes cimport (
35
PyBytes_FromStringAndSize,
38
from cpython.object cimport (
41
from cpython.mem cimport (
24
cdef extern from "Python.h":
25
ctypedef int Py_ssize_t # Required for older pyrex versions
26
int PyString_CheckExact(object)
27
char * PyString_AS_STRING(object)
28
Py_ssize_t PyString_GET_SIZE(object)
29
object PyString_FromStringAndSize(char *, Py_ssize_t)
33
ctypedef unsigned long size_t
34
void * malloc(size_t) nogil
35
void * realloc(void *, size_t) nogil
36
void free(void *) nogil
37
void memcpy(void *, void *, size_t) nogil
47
40
cdef extern from "delta.h":
51
44
unsigned long agg_offset
52
45
struct delta_index:
54
ctypedef enum delta_result:
62
delta_result create_delta_index(source_info *src,
65
int max_entries) nogil
66
delta_result create_delta_index_from_delta(source_info *delta,
68
delta_index **fresh) nogil
47
delta_index * create_delta_index(source_info *src, delta_index *old) nogil
48
delta_index * create_delta_index_from_delta(source_info *delta,
49
delta_index *old) nogil
69
50
void free_delta_index(delta_index *index) nogil
70
delta_result create_delta(delta_index *indexes,
71
void *buf, unsigned long bufsize,
72
unsigned long *delta_size,
73
unsigned long max_delta_size,
74
void **delta_data) nogil
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size) nogil
75
54
unsigned long get_delta_hdr_size(unsigned char **datap,
76
55
unsigned char *top) nogil
77
unsigned long sizeof_delta_index(delta_index *index)
78
56
Py_ssize_t DELTA_SIZE_MIN
79
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
80
int get_entry_summary(delta_index *index, int pos,
81
unsigned int *global_offset, unsigned int *hash_val)
82
unsigned int rabin_hash (unsigned char *data)
59
cdef void *safe_malloc(size_t count) except NULL:
61
result = malloc(count)
63
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
67
cdef void *safe_realloc(void * old, size_t count) except NULL:
69
result = realloc(old, count)
71
raise MemoryError('Failed to reallocate to %d bytes of memory'
76
cdef int safe_free(void **val) except -1:
85
82
def make_delta_index(source):
86
83
return DeltaIndex(source)
89
cdef object _translate_delta_failure(delta_result result):
90
if result == DELTA_OUT_OF_MEMORY:
91
return MemoryError("Delta function failed to allocate memory")
92
elif result == DELTA_INDEX_NEEDED:
93
return ValueError("Delta function requires delta_index param")
94
elif result == DELTA_SOURCE_EMPTY:
95
return ValueError("Delta function given empty source_info param")
96
elif result == DELTA_SOURCE_BAD:
97
return RuntimeError("Delta function given invalid source_info param")
98
elif result == DELTA_BUFFER_EMPTY:
99
return ValueError("Delta function given empty buffer params")
100
return AssertionError("Unrecognised delta result code: %d" % result)
103
def _rabin_hash(content):
104
if not PyBytes_CheckExact(content):
105
raise ValueError('content must be a string')
106
if len(content) < 16:
107
raise ValueError('content must be at least 16 bytes long')
108
# Try to cast it to an int, if it can fit
109
return int(rabin_hash(<unsigned char*>(PyBytes_AS_STRING(content))))
112
86
cdef class DeltaIndex:
114
cdef readonly list _sources
88
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
89
# isn't performance critical
90
# cdef readonly list _sources
91
cdef readonly object _sources
115
92
cdef source_info *_source_infos
116
93
cdef delta_index *_index
94
cdef readonly unsigned int _max_num_sources
117
95
cdef public unsigned long _source_offset
118
cdef readonly unsigned int _max_num_sources
119
cdef public int _max_bytes_to_index
121
def __init__(self, source=None, max_bytes_to_index=None):
97
def __init__(self, source=None):
122
98
self._sources = []
123
99
self._index = NULL
124
100
self._max_num_sources = 65000
125
self._source_infos = <source_info *>PyMem_Malloc(
126
sizeof(source_info) * self._max_num_sources)
127
if self._source_infos == NULL:
128
raise MemoryError('failed to allocate memory for DeltaIndex')
101
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
102
* self._max_num_sources)
129
103
self._source_offset = 0
130
self._max_bytes_to_index = 0
131
if max_bytes_to_index is not None:
132
self._max_bytes_to_index = max_bytes_to_index
134
105
if source is not None:
135
106
self.add_source(source, 0)
137
def __sizeof__(self):
138
# We want to track the _source_infos allocations, but the referenced
139
# void* are actually tracked in _sources itself.
140
return (sizeof(DeltaIndex)
141
+ (sizeof(source_info) * self._max_num_sources)
142
+ sizeof_delta_index(self._index))
144
108
def __repr__(self):
145
109
return '%s(%d, %d)' % (self.__class__.__name__,
146
110
len(self._sources), self._source_offset)
149
113
if self._index != NULL:
150
114
free_delta_index(self._index)
151
115
self._index = NULL
152
PyMem_Free(self._source_infos)
116
safe_free(<void **>&self._source_infos)
154
118
def _has_index(self):
155
119
return (self._index != NULL)
157
def _dump_index(self):
158
"""Dump the pointers in the index.
160
This is an arbitrary layout, used for testing. It is not meant to be
161
used in production code.
163
:return: (hash_list, entry_list)
164
hash_list A list of offsets, so hash[i] points to the 'hash
165
bucket' starting at the given offset and going until
167
entry_list A list of (text_offset, hash_val). text_offset is the
168
offset in the "source" texts, and hash_val is the RABIN
169
hash for that offset.
170
Note that the entry should be in the hash bucket
172
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
175
cdef unsigned int text_offset
176
cdef unsigned int hash_val
177
cdef unsigned int hash_offset
178
if self._index == NULL:
182
while get_hash_offset(self._index, pos, &hash_offset):
183
hash_list.append(int(hash_offset))
187
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
188
# Map back using 'int' so that we don't get Long everywhere, when
189
# almost everything is <2**31.
190
val = tuple(map(int, [text_offset, hash_val]))
191
entry_list.append(val)
193
return hash_list, entry_list
195
121
def add_delta_source(self, delta, unadded_bytes):
196
122
"""Add a new delta to the source texts.
202
128
cdef char *c_delta
203
129
cdef Py_ssize_t c_delta_size
204
130
cdef delta_index *index
205
cdef delta_result res
206
131
cdef unsigned int source_location
207
132
cdef source_info *src
208
133
cdef unsigned int num_indexes
210
if not PyBytes_CheckExact(delta):
211
raise TypeError('delta is not a bytestring')
135
if not PyString_CheckExact(delta):
136
raise TypeError('delta is not a str')
213
138
source_location = len(self._sources)
214
139
if source_location >= self._max_num_sources:
215
140
self._expand_sources()
216
141
self._sources.append(delta)
217
c_delta = PyBytes_AS_STRING(delta)
218
c_delta_size = PyBytes_GET_SIZE(delta)
142
c_delta = PyString_AS_STRING(delta)
143
c_delta_size = PyString_GET_SIZE(delta)
219
144
src = self._source_infos + source_location
220
145
src.buf = c_delta
221
146
src.size = c_delta_size
222
147
src.agg_offset = self._source_offset + unadded_bytes
224
res = create_delta_index_from_delta(src, self._index, &index)
226
raise _translate_delta_failure(res)
149
index = create_delta_index_from_delta(src, self._index)
227
150
self._source_offset = src.agg_offset + src.size
228
if index != self._index:
229
152
free_delta_index(self._index)
230
153
self._index = index
235
158
:param source: The text in question, this must be a byte string
236
159
:param unadded_bytes: Assume there are this many bytes that didn't get
237
160
added between this source and the end of the previous source.
238
:param max_pointers: Add no more than this many entries to the index.
239
By default, we sample every 16 bytes, if that would require more
240
than max_entries, we will reduce the sampling rate.
241
A value of 0 means unlimited, None means use the default limit.
243
162
cdef char *c_source
244
163
cdef Py_ssize_t c_source_size
245
164
cdef delta_index *index
246
cdef delta_result res
247
165
cdef unsigned int source_location
248
166
cdef source_info *src
249
167
cdef unsigned int num_indexes
250
cdef int max_num_entries
252
if not PyBytes_CheckExact(source):
253
raise TypeError('source is not a bytestring')
169
if not PyString_CheckExact(source):
170
raise TypeError('source is not a str')
255
172
source_location = len(self._sources)
256
173
if source_location >= self._max_num_sources:
270
187
# We delay creating the index on the first insert
271
188
if source_location != 0:
273
res = create_delta_index(src, self._index, &index,
274
self._max_bytes_to_index)
276
raise _translate_delta_failure(res)
277
if index != self._index:
190
index = create_delta_index(src, self._index)
278
192
free_delta_index(self._index)
279
193
self._index = index
281
195
cdef _populate_first_index(self):
282
196
cdef delta_index *index
283
cdef delta_result res
284
197
if len(self._sources) != 1 or self._index != NULL:
285
198
raise AssertionError('_populate_first_index should only be'
286
199
' called when we have a single source and no index yet')
288
# We know that self._index is already NULL, so create_delta_index
289
# will always create a new index unless there's a malloc failure
201
# We know that self._index is already NULL, so whatever
202
# create_delta_index returns is fine
291
res = create_delta_index(&self._source_infos[0], NULL, &index,
292
self._max_bytes_to_index)
294
raise _translate_delta_failure(res)
204
self._index = create_delta_index(&self._source_infos[0], NULL)
205
assert self._index != NULL
297
207
cdef _expand_sources(self):
298
208
raise RuntimeError('if we move self._source_infos, then we need to'
299
209
' change all of the index pointers as well.')
210
self._max_num_sources = self._max_num_sources * 2
211
self._source_infos = <source_info *>safe_realloc(self._source_infos,
213
* self._max_num_sources)
301
215
def make_delta(self, target_bytes, max_delta_size=0):
302
216
"""Create a delta from the current source to the target bytes."""
313
226
# We were just lazy about generating the index
314
227
self._populate_first_index()
316
if not PyBytes_CheckExact(target_bytes):
317
raise TypeError('target is not a bytestring')
229
if not PyString_CheckExact(target_bytes):
230
raise TypeError('target is not a str')
319
target = PyBytes_AS_STRING(target_bytes)
320
target_size = PyBytes_GET_SIZE(target_bytes)
232
target = PyString_AS_STRING(target_bytes)
233
target_size = PyString_GET_SIZE(target_bytes)
322
235
# TODO: inline some of create_delta so we at least don't have to double
323
# malloc, and can instead use PyBytes_FromStringAndSize, to
236
# malloc, and can instead use PyString_FromStringAndSize, to
324
237
# allocate the bytes into the final string
325
238
c_max_delta_size = max_delta_size
327
res = create_delta(self._index, target, target_size,
328
&delta_size, c_max_delta_size, &delta)
240
delta = create_delta(self._index,
242
&delta_size, c_max_delta_size)
331
result = PyBytes_FromStringAndSize(<char *>delta, delta_size)
245
result = PyString_FromStringAndSize(<char *>delta, delta_size)
333
elif res != DELTA_SIZE_TOO_BIG:
334
raise _translate_delta_failure(res)
349
261
cdef Py_ssize_t delta_size
351
if not PyBytes_CheckExact(source_bytes):
352
raise TypeError('source is not a bytestring')
353
if not PyBytes_CheckExact(delta_bytes):
354
raise TypeError('delta is not a bytestring')
355
source = PyBytes_AS_STRING(source_bytes)
356
source_size = PyBytes_GET_SIZE(source_bytes)
357
delta = PyBytes_AS_STRING(delta_bytes)
358
delta_size = PyBytes_GET_SIZE(delta_bytes)
263
if not PyString_CheckExact(source_bytes):
264
raise TypeError('source is not a str')
265
if not PyString_CheckExact(delta_bytes):
266
raise TypeError('delta is not a str')
267
source = PyString_AS_STRING(source_bytes)
268
source_size = PyString_GET_SIZE(source_bytes)
269
delta = PyString_AS_STRING(delta_bytes)
270
delta_size = PyString_GET_SIZE(delta_bytes)
359
271
# Code taken from patch-delta.c, only brought here to give better error
360
272
# handling, and to avoid double allocating memory
361
273
if (delta_size < DELTA_SIZE_MIN):
388
300
count = count + 1
390
off = off | (data[count] << 8)
302
off = off | (bytes[count] << 8)
391
303
count = count + 1
393
off = off | (data[count] << 16)
305
off = off | (bytes[count] << 16)
394
306
count = count + 1
396
off = off | (data[count] << 24)
308
off = off | (bytes[count] << 24)
397
309
count = count + 1
400
312
count = count + 1
402
size = size | (data[count] << 8)
314
size = size | (bytes[count] << 8)
403
315
count = count + 1
405
size = size | (data[count] << 16)
317
size = size | (bytes[count] << 16)
406
318
count = count + 1
414
326
cdef object _apply_delta(char *source, Py_ssize_t source_size,
415
327
char *delta, Py_ssize_t delta_size):
416
328
"""common functionality between apply_delta and apply_delta_to_source."""
417
cdef unsigned char *data
418
cdef unsigned char *top
419
cdef unsigned char *dst_buf
420
cdef unsigned char *out
421
cdef unsigned char cmd
329
cdef unsigned char *data, *top
330
cdef unsigned char *dst_buf, *out, cmd
422
331
cdef Py_ssize_t size
423
332
cdef unsigned int cp_off, cp_size