13
13
# You should have received a copy of the GNU General Public License
14
14
# along with this program; if not, write to the Free Software
15
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
# cython: language_level=3
17
19
"""Compiled extensions for doing compression."""
20
22
cdef extern from "python-compat.h":
24
cdef extern from "Python.h":
25
ctypedef struct PyObject:
27
ctypedef int Py_ssize_t # Required for older pyrex versions
28
int PyString_CheckExact(object)
29
char * PyString_AS_STRING(object)
30
Py_ssize_t PyString_GET_SIZE(object)
31
object PyString_FromStringAndSize(char *, Py_ssize_t)
35
ctypedef unsigned long size_t
36
void * malloc(size_t) nogil
37
void * realloc(void *, size_t) nogil
38
void free(void *) nogil
39
void memcpy(void *, void *, size_t) nogil
25
from libc.stdlib cimport (
28
from libc.string cimport (
32
from cpython.bytes cimport (
35
PyBytes_FromStringAndSize,
38
from cpython.object cimport (
41
from cpython.mem cimport (
42
47
cdef extern from "delta.h":
77
82
unsigned int rabin_hash (unsigned char *data)
80
cdef void *safe_malloc(size_t count) except NULL:
82
result = malloc(count)
84
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
88
cdef void *safe_realloc(void * old, size_t count) except NULL:
90
result = realloc(old, count)
92
raise MemoryError('Failed to reallocate to %d bytes of memory'
97
cdef int safe_free(void **val) except -1:
103
85
# Convenience factory: construct a DeltaIndex from `source` and return it.
def make_delta_index(source):
104
86
return DeltaIndex(source)
121
103
# Hash `content` with the C rabin_hash() routine declared from delta.h and
# return the result as a Python int.
#
# Raises ValueError if `content` is not an exact byte string, or if it is
# shorter than 16 bytes.
def _rabin_hash(content):
122
if not PyString_CheckExact(content):
104
if not PyBytes_CheckExact(content):
123
105
raise ValueError('content must be a string')
124
106
# rabin_hash() is handed only a raw pointer, so the length has to be validated
# here.  Presumably it reads a fixed 16-byte window -- TODO confirm in delta.h.
if len(content) < 16:
125
107
raise ValueError('content must be at least 16 bytes long')
126
108
# Try to cast it to an int, if it can fit
127
return int(rabin_hash(<unsigned char*>(PyString_AS_STRING(content))))
109
return int(rabin_hash(<unsigned char*>(PyBytes_AS_STRING(content))))
130
112
cdef class DeltaIndex:
132
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
133
# isn't performance critical
134
# cdef readonly list _sources
135
cdef readonly object _sources
114
cdef readonly list _sources
136
115
cdef source_info *_source_infos
137
116
cdef delta_index *_index
138
117
cdef public unsigned long _source_offset
143
122
self._sources = []
144
123
self._index = NULL
145
124
self._max_num_sources = 65000
146
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
147
* self._max_num_sources)
125
self._source_infos = <source_info *>PyMem_Malloc(
126
sizeof(source_info) * self._max_num_sources)
127
if self._source_infos == NULL:
128
raise MemoryError('failed to allocate memory for DeltaIndex')
148
129
self._source_offset = 0
149
130
self._max_bytes_to_index = 0
150
131
if max_bytes_to_index is not None:
156
137
def __sizeof__(self):
157
138
# We want to track the _source_infos allocations, but the referenced
158
139
# void* are actually tracked in _sources itself.
159
# XXX: Cython is capable of doing sizeof(class) and returning the size
160
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
161
# to do it manually. *sigh* Note that we might get it wrong
162
# because of alignment issues.
164
# PyObject start, vtable *, 3 object pointers, 2 C ints
165
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
166
+ sizeof(unsigned long)
167
+ sizeof(unsigned int))
140
return (sizeof(DeltaIndex)
168
141
+ (sizeof(source_info) * self._max_num_sources)
169
142
+ sizeof_delta_index(self._index))
172
144
def __repr__(self):
173
145
return '%s(%d, %d)' % (self.__class__.__name__,
235
207
cdef source_info *src
236
208
cdef unsigned int num_indexes
238
if not PyString_CheckExact(delta):
239
raise TypeError('delta is not a str')
210
if not PyBytes_CheckExact(delta):
211
raise TypeError('delta is not a bytestring')
241
213
source_location = len(self._sources)
242
214
if source_location >= self._max_num_sources:
243
215
self._expand_sources()
244
216
self._sources.append(delta)
245
c_delta = PyString_AS_STRING(delta)
246
c_delta_size = PyString_GET_SIZE(delta)
217
c_delta = PyBytes_AS_STRING(delta)
218
c_delta_size = PyBytes_GET_SIZE(delta)
247
219
src = self._source_infos + source_location
248
220
src.buf = c_delta
249
221
src.size = c_delta_size
277
249
cdef unsigned int num_indexes
278
250
cdef int max_num_entries
280
if not PyString_CheckExact(source):
281
raise TypeError('source is not a str')
252
if not PyBytes_CheckExact(source):
253
raise TypeError('source is not a bytestring')
283
255
source_location = len(self._sources)
284
256
if source_location >= self._max_num_sources:
287
259
# We were lazy about populating the index, create it now
288
260
self._populate_first_index()
289
261
self._sources.append(source)
290
c_source = PyString_AS_STRING(source)
291
c_source_size = PyString_GET_SIZE(source)
262
c_source = PyBytes_AS_STRING(source)
263
c_source_size = PyBytes_GET_SIZE(source)
292
264
src = self._source_infos + source_location
293
265
src.buf = c_source
294
266
src.size = c_source_size
325
297
# Placeholder for growing the _source_infos table.  It always raises
# RuntimeError: as the message says, moving self._source_infos would also
# require changing all of the index pointers.
cdef _expand_sources(self):
326
298
raise RuntimeError('if we move self._source_infos, then we need to'
327
299
' change all of the index pointers as well.')
# The statements below follow the unconditional raise above, so they can
# never execute.
328
self._max_num_sources = self._max_num_sources * 2
329
self._source_infos = <source_info *>safe_realloc(self._source_infos,
331
* self._max_num_sources)
333
301
def make_delta(self, target_bytes, max_delta_size=0):
334
302
"""Create a delta from the current source to the target bytes."""
345
313
# We were just lazy about generating the index
346
314
self._populate_first_index()
348
if not PyString_CheckExact(target_bytes):
349
raise TypeError('target is not a str')
316
if not PyBytes_CheckExact(target_bytes):
317
raise TypeError('target is not a bytestring')
351
target = PyString_AS_STRING(target_bytes)
352
target_size = PyString_GET_SIZE(target_bytes)
319
target = PyBytes_AS_STRING(target_bytes)
320
target_size = PyBytes_GET_SIZE(target_bytes)
354
322
# TODO: inline some of create_delta so we at least don't have to double
355
# malloc, and can instead use PyString_FromStringAndSize, to
323
# malloc, and can instead use PyBytes_FromStringAndSize, to
356
324
# allocate the bytes into the final string
357
325
c_max_delta_size = max_delta_size
360
328
&delta_size, c_max_delta_size, &delta)
362
330
if res == DELTA_OK:
363
result = PyString_FromStringAndSize(<char *>delta, delta_size)
331
result = PyBytes_FromStringAndSize(<char *>delta, delta_size)
365
333
elif res != DELTA_SIZE_TOO_BIG:
366
334
raise _translate_delta_failure(res)
381
349
cdef Py_ssize_t delta_size
383
if not PyString_CheckExact(source_bytes):
384
raise TypeError('source is not a str')
385
if not PyString_CheckExact(delta_bytes):
386
raise TypeError('delta is not a str')
387
source = PyString_AS_STRING(source_bytes)
388
source_size = PyString_GET_SIZE(source_bytes)
389
delta = PyString_AS_STRING(delta_bytes)
390
delta_size = PyString_GET_SIZE(delta_bytes)
351
if not PyBytes_CheckExact(source_bytes):
352
raise TypeError('source is not a bytestring')
353
if not PyBytes_CheckExact(delta_bytes):
354
raise TypeError('delta is not a bytestring')
355
source = PyBytes_AS_STRING(source_bytes)
356
source_size = PyBytes_GET_SIZE(source_bytes)
357
delta = PyBytes_AS_STRING(delta_bytes)
358
delta_size = PyBytes_GET_SIZE(delta_bytes)
391
359
# Code taken from patch-delta.c, only brought here to give better error
392
360
# handling, and to avoid double allocating memory
393
361
if (delta_size < DELTA_SIZE_MIN):
398
366
return _apply_delta(source, source_size, delta, delta_size)
401
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
369
cdef unsigned char *_decode_copy_instruction(unsigned char *data,
402
370
unsigned char cmd, unsigned int *offset,
403
371
unsigned int *length) nogil: # cannot_raise
404
372
"""Decode a copy instruction from the next few bytes.
420
388
count = count + 1
422
off = off | (bytes[count] << 8)
390
off = off | (data[count] << 8)
423
391
count = count + 1
425
off = off | (bytes[count] << 16)
393
off = off | (data[count] << 16)
426
394
count = count + 1
428
off = off | (bytes[count] << 24)
396
off = off | (data[count] << 24)
429
397
count = count + 1
432
400
count = count + 1
434
size = size | (bytes[count] << 8)
402
size = size | (data[count] << 8)
435
403
count = count + 1
437
size = size | (bytes[count] << 16)
405
size = size | (data[count] << 16)
438
406
count = count + 1
446
414
cdef object _apply_delta(char *source, Py_ssize_t source_size,
447
415
char *delta, Py_ssize_t delta_size):
448
416
"""common functionality between apply_delta and apply_delta_to_source."""
449
cdef unsigned char *data, *top
450
cdef unsigned char *dst_buf, *out, cmd
417
cdef unsigned char *data
418
cdef unsigned char *top
419
cdef unsigned char *dst_buf
420
cdef unsigned char *out
421
cdef unsigned char cmd
451
422
cdef Py_ssize_t size
452
423
cdef unsigned int cp_off, cp_size
458
429
# now the result size
459
430
size = get_delta_hdr_size(&data, top)
460
result = PyString_FromStringAndSize(NULL, size)
461
dst_buf = <unsigned char*>PyString_AS_STRING(result)
431
result = PyBytes_FromStringAndSize(NULL, size)
432
dst_buf = <unsigned char*>PyBytes_AS_STRING(result)
509
480
raise RuntimeError('Did not extract the number of bytes we expected'
510
481
' we were left with %d bytes in "size", and top - data = %d'
511
482
% (size, <int>(top - data)))
514
484
# *dst_size = out - dst_buf;
515
if (out - dst_buf) != PyString_GET_SIZE(result):
485
if (out - dst_buf) != PyBytes_GET_SIZE(result):
516
486
raise RuntimeError('Number of bytes extracted did not match the'
517
487
' size encoded in the delta header.')
526
496
cdef Py_ssize_t c_delta_size
527
497
cdef Py_ssize_t c_delta_start, c_delta_end
529
if not PyString_CheckExact(source):
499
if not PyBytes_CheckExact(source):
530
500
raise TypeError('source is not a str')
531
c_source_size = PyString_GET_SIZE(source)
501
c_source_size = PyBytes_GET_SIZE(source)
532
502
c_delta_start = delta_start
533
503
c_delta_end = delta_end
534
504
if c_delta_start >= c_source_size:
539
509
raise ValueError('delta starts after it ends')
541
511
c_delta_size = c_delta_end - c_delta_start
542
c_source = PyString_AS_STRING(source)
512
c_source = PyBytes_AS_STRING(source)
543
513
c_delta = c_source + c_delta_start
544
514
# We don't use source_size, because we know the delta should not refer to
545
515
# any bytes after it starts
563
533
raise ValueError('encode_base128_int overflowed the buffer')
564
534
c_bytes[count] = <unsigned char>(c_val & 0xFF)
565
535
count = count + 1
566
return PyString_FromStringAndSize(<char *>c_bytes, count)
569
def decode_base128_int(bytes):
536
return PyBytes_FromStringAndSize(<char *>c_bytes, count)
539
def decode_base128_int(data):
570
540
"""Decode an integer from a 7-bit lsb encoding."""
581
if not PyString_CheckExact(bytes):
551
if not PyBytes_CheckExact(data):
582
552
raise TypeError('bytes is not a string')
583
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
553
c_bytes = <unsigned char*>PyBytes_AS_STRING(data)
584
554
# We take off 1, because we have to be able to decode the non-expanded byte
585
num_low_bytes = PyString_GET_SIZE(bytes) - 1
555
num_low_bytes = PyBytes_GET_SIZE(data) - 1
586
556
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
587
557
val = val | ((c_bytes[offset] & 0x7F) << shift)
588
558
shift = shift + 7