1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
19
from __future__ import absolute_import
22
cdef extern from "python-compat.h":
26
cdef extern from "Python.h":
27
ctypedef struct PyObject:
29
ctypedef int Py_ssize_t # Required for older pyrex versions
30
int PyString_CheckExact(object)
31
char * PyString_AS_STRING(object)
32
Py_ssize_t PyString_GET_SIZE(object)
33
object PyString_FromStringAndSize(char *, Py_ssize_t)
37
ctypedef unsigned long size_t
38
void * malloc(size_t) nogil
39
void * realloc(void *, size_t) nogil
40
void free(void *) nogil
41
void memcpy(void *, void *, size_t) nogil
44
cdef extern from "delta.h":
48
unsigned long agg_offset
51
ctypedef enum delta_result:
59
delta_result create_delta_index(source_info *src,
62
int max_entries) nogil
63
delta_result create_delta_index_from_delta(source_info *delta,
65
delta_index **fresh) nogil
66
void free_delta_index(delta_index *index) nogil
67
delta_result create_delta(delta_index *indexes,
68
void *buf, unsigned long bufsize,
69
unsigned long *delta_size,
70
unsigned long max_delta_size,
71
void **delta_data) nogil
72
unsigned long get_delta_hdr_size(unsigned char **datap,
73
unsigned char *top) nogil
74
unsigned long sizeof_delta_index(delta_index *index)
75
Py_ssize_t DELTA_SIZE_MIN
76
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
77
int get_entry_summary(delta_index *index, int pos,
78
unsigned int *global_offset, unsigned int *hash_val)
79
unsigned int rabin_hash (unsigned char *data)
82
cdef void *safe_malloc(size_t count) except NULL:
84
result = malloc(count)
86
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
90
cdef void *safe_realloc(void * old, size_t count) except NULL:
92
result = realloc(old, count)
94
raise MemoryError('Failed to reallocate to %d bytes of memory'
99
cdef int safe_free(void **val) except -1:
105
def make_delta_index(source):
106
return DeltaIndex(source)
109
cdef object _translate_delta_failure(delta_result result):
110
if result == DELTA_OUT_OF_MEMORY:
111
return MemoryError("Delta function failed to allocate memory")
112
elif result == DELTA_INDEX_NEEDED:
113
return ValueError("Delta function requires delta_index param")
114
elif result == DELTA_SOURCE_EMPTY:
115
return ValueError("Delta function given empty source_info param")
116
elif result == DELTA_SOURCE_BAD:
117
return RuntimeError("Delta function given invalid source_info param")
118
elif result == DELTA_BUFFER_EMPTY:
119
return ValueError("Delta function given empty buffer params")
120
return AssertionError("Unrecognised delta result code: %d" % result)
123
def _rabin_hash(content):
124
if not PyString_CheckExact(content):
125
raise ValueError('content must be a string')
126
if len(content) < 16:
127
raise ValueError('content must be at least 16 bytes long')
128
# Try to cast it to an int, if it can fit
129
return int(rabin_hash(<unsigned char*>(PyString_AS_STRING(content))))
132
cdef class DeltaIndex:
134
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
135
# isn't performance critical
136
# cdef readonly list _sources
137
cdef readonly object _sources
138
cdef source_info *_source_infos
139
cdef delta_index *_index
140
cdef public unsigned long _source_offset
141
cdef readonly unsigned int _max_num_sources
142
cdef public int _max_bytes_to_index
144
def __init__(self, source=None, max_bytes_to_index=None):
147
self._max_num_sources = 65000
148
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
149
* self._max_num_sources)
150
self._source_offset = 0
151
self._max_bytes_to_index = 0
152
if max_bytes_to_index is not None:
153
self._max_bytes_to_index = max_bytes_to_index
155
if source is not None:
156
self.add_source(source, 0)
158
def __sizeof__(self):
159
# We want to track the _source_infos allocations, but the referenced
160
# void* are actually tracked in _sources itself.
161
# XXX: Cython is capable of doing sizeof(class) and returning the size
162
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
163
# to do it manually. *sigh* Note that we might get it wrong
164
# because of alignment issues.
166
# PyObject start, vtable *, 3 object pointers, 2 C ints
167
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
168
+ sizeof(unsigned long)
169
+ sizeof(unsigned int))
170
+ (sizeof(source_info) * self._max_num_sources)
171
+ sizeof_delta_index(self._index))
175
return '%s(%d, %d)' % (self.__class__.__name__,
176
len(self._sources), self._source_offset)
178
def __dealloc__(self):
179
if self._index != NULL:
180
free_delta_index(self._index)
182
safe_free(<void **>&self._source_infos)
184
def _has_index(self):
185
return (self._index != NULL)
187
def _dump_index(self):
188
"""Dump the pointers in the index.
190
This is an arbitrary layout, used for testing. It is not meant to be
191
used in production code.
193
:return: (hash_list, entry_list)
194
hash_list A list of offsets, so hash[i] points to the 'hash
195
bucket' starting at the given offset and going until
197
entry_list A list of (text_offset, hash_val). text_offset is the
198
offset in the "source" texts, and hash_val is the RABIN
199
hash for that offset.
200
Note that the entry should be in the hash bucket
202
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
205
cdef unsigned int text_offset
206
cdef unsigned int hash_val
207
cdef unsigned int hash_offset
208
if self._index == NULL:
212
while get_hash_offset(self._index, pos, &hash_offset):
213
hash_list.append(int(hash_offset))
217
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
218
# Map back using 'int' so that we don't get Long everywhere, when
219
# almost everything is <2**31.
220
val = tuple(map(int, [text_offset, hash_val]))
221
entry_list.append(val)
223
return hash_list, entry_list
225
def add_delta_source(self, delta, unadded_bytes):
226
"""Add a new delta to the source texts.
228
:param delta: The text of the delta, this must be a byte string.
229
:param unadded_bytes: Number of bytes that were added to the source
230
that were not indexed.
233
cdef Py_ssize_t c_delta_size
234
cdef delta_index *index
235
cdef delta_result res
236
cdef unsigned int source_location
237
cdef source_info *src
238
cdef unsigned int num_indexes
240
if not PyString_CheckExact(delta):
241
raise TypeError('delta is not a str')
243
source_location = len(self._sources)
244
if source_location >= self._max_num_sources:
245
self._expand_sources()
246
self._sources.append(delta)
247
c_delta = PyString_AS_STRING(delta)
248
c_delta_size = PyString_GET_SIZE(delta)
249
src = self._source_infos + source_location
251
src.size = c_delta_size
252
src.agg_offset = self._source_offset + unadded_bytes
254
res = create_delta_index_from_delta(src, self._index, &index)
256
raise _translate_delta_failure(res)
257
self._source_offset = src.agg_offset + src.size
258
if index != self._index:
259
free_delta_index(self._index)
262
def add_source(self, source, unadded_bytes):
263
"""Add a new bit of source text to the delta indexes.
265
:param source: The text in question, this must be a byte string
266
:param unadded_bytes: Assume there are this many bytes that didn't get
267
added between this source and the end of the previous source.
268
:param max_pointers: Add no more than this many entries to the index.
269
By default, we sample every 16 bytes, if that would require more
270
than max_entries, we will reduce the sampling rate.
271
A value of 0 means unlimited, None means use the default limit.
274
cdef Py_ssize_t c_source_size
275
cdef delta_index *index
276
cdef delta_result res
277
cdef unsigned int source_location
278
cdef source_info *src
279
cdef unsigned int num_indexes
280
cdef int max_num_entries
282
if not PyString_CheckExact(source):
283
raise TypeError('source is not a str')
285
source_location = len(self._sources)
286
if source_location >= self._max_num_sources:
287
self._expand_sources()
288
if source_location != 0 and self._index == NULL:
289
# We were lazy about populating the index, create it now
290
self._populate_first_index()
291
self._sources.append(source)
292
c_source = PyString_AS_STRING(source)
293
c_source_size = PyString_GET_SIZE(source)
294
src = self._source_infos + source_location
296
src.size = c_source_size
298
src.agg_offset = self._source_offset + unadded_bytes
299
self._source_offset = src.agg_offset + src.size
300
# We delay creating the index on the first insert
301
if source_location != 0:
303
res = create_delta_index(src, self._index, &index,
304
self._max_bytes_to_index)
306
raise _translate_delta_failure(res)
307
if index != self._index:
308
free_delta_index(self._index)
311
cdef _populate_first_index(self):
312
cdef delta_index *index
313
cdef delta_result res
314
if len(self._sources) != 1 or self._index != NULL:
315
raise AssertionError('_populate_first_index should only be'
316
' called when we have a single source and no index yet')
318
# We know that self._index is already NULL, so create_delta_index
319
# will always create a new index unless there's a malloc failure
321
res = create_delta_index(&self._source_infos[0], NULL, &index,
322
self._max_bytes_to_index)
324
raise _translate_delta_failure(res)
327
cdef _expand_sources(self):
328
raise RuntimeError('if we move self._source_infos, then we need to'
329
' change all of the index pointers as well.')
330
self._max_num_sources = self._max_num_sources * 2
331
self._source_infos = <source_info *>safe_realloc(self._source_infos,
333
* self._max_num_sources)
335
def make_delta(self, target_bytes, max_delta_size=0):
336
"""Create a delta from the current source to the target bytes."""
338
cdef Py_ssize_t target_size
340
cdef unsigned long delta_size
341
cdef unsigned long c_max_delta_size
342
cdef delta_result res
344
if self._index == NULL:
345
if len(self._sources) == 0:
347
# We were just lazy about generating the index
348
self._populate_first_index()
350
if not PyString_CheckExact(target_bytes):
351
raise TypeError('target is not a str')
353
target = PyString_AS_STRING(target_bytes)
354
target_size = PyString_GET_SIZE(target_bytes)
356
# TODO: inline some of create_delta so we at least don't have to double
357
# malloc, and can instead use PyString_FromStringAndSize, to
358
# allocate the bytes into the final string
359
c_max_delta_size = max_delta_size
361
res = create_delta(self._index, target, target_size,
362
&delta_size, c_max_delta_size, &delta)
365
result = PyString_FromStringAndSize(<char *>delta, delta_size)
367
elif res != DELTA_SIZE_TOO_BIG:
368
raise _translate_delta_failure(res)
372
def make_delta(source_bytes, target_bytes):
373
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
374
di = DeltaIndex(source_bytes)
375
return di.make_delta(target_bytes)
378
def apply_delta(source_bytes, delta_bytes):
379
"""Apply a delta generated by make_delta to source_bytes."""
381
cdef Py_ssize_t source_size
383
cdef Py_ssize_t delta_size
385
if not PyString_CheckExact(source_bytes):
386
raise TypeError('source is not a str')
387
if not PyString_CheckExact(delta_bytes):
388
raise TypeError('delta is not a str')
389
source = PyString_AS_STRING(source_bytes)
390
source_size = PyString_GET_SIZE(source_bytes)
391
delta = PyString_AS_STRING(delta_bytes)
392
delta_size = PyString_GET_SIZE(delta_bytes)
393
# Code taken from patch-delta.c, only brought here to give better error
394
# handling, and to avoid double allocating memory
395
if (delta_size < DELTA_SIZE_MIN):
396
# XXX: Invalid delta block
397
raise RuntimeError('delta_size %d smaller than min delta size %d'
398
% (delta_size, DELTA_SIZE_MIN))
400
return _apply_delta(source, source_size, delta, delta_size)
403
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
404
unsigned char cmd, unsigned int *offset,
405
unsigned int *length) nogil: # cannot_raise
406
"""Decode a copy instruction from the next few bytes.
408
A copy instruction is a variable number of bytes, so we will parse the
409
bytes we care about, and return the new position, as well as the offset and
410
length referred to in the bytes.
412
:param bytes: Pointer to the start of bytes after cmd
413
:param cmd: The command code
414
:return: Pointer to the bytes just after the last decode byte
416
cdef unsigned int off, size, count
424
off = off | (bytes[count] << 8)
427
off = off | (bytes[count] << 16)
430
off = off | (bytes[count] << 24)
436
size = size | (bytes[count] << 8)
439
size = size | (bytes[count] << 16)
448
cdef object _apply_delta(char *source, Py_ssize_t source_size,
449
char *delta, Py_ssize_t delta_size):
450
"""common functionality between apply_delta and apply_delta_to_source."""
451
cdef unsigned char *data, *top
452
cdef unsigned char *dst_buf, *out, cmd
454
cdef unsigned int cp_off, cp_size
457
data = <unsigned char *>delta
458
top = data + delta_size
460
# now the result size
461
size = get_delta_hdr_size(&data, top)
462
result = PyString_FromStringAndSize(NULL, size)
463
dst_buf = <unsigned char*>PyString_AS_STRING(result)
473
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
474
if (cp_off + cp_size < cp_size or
475
cp_off + cp_size > <unsigned int>source_size or
476
cp_size > <unsigned int>size):
479
memcpy(out, source + cp_off, cp_size)
481
size = size - cp_size
485
# cmd == 0 is reserved for future encoding
486
# extensions. In the mean time we must fail when
487
# encountering them (might be data corruption).
493
memcpy(out, data, cmd)
499
raise ValueError('Something wrong with:'
500
' cp_off = %s, cp_size = %s'
501
' source_size = %s, size = %s'
502
% (cp_off, cp_size, source_size, size))
504
raise ValueError('Got delta opcode: 0, not supported')
506
raise ValueError('Insert instruction longer than remaining'
507
' bytes: %d > %d' % (cmd, size))
510
if (data != top or size != 0):
511
raise RuntimeError('Did not extract the number of bytes we expected'
512
' we were left with %d bytes in "size", and top - data = %d'
513
% (size, <int>(top - data)))
516
# *dst_size = out - dst_buf;
517
if (out - dst_buf) != PyString_GET_SIZE(result):
518
raise RuntimeError('Number of bytes extracted did not match the'
519
' size encoded in the delta header.')
523
def apply_delta_to_source(source, delta_start, delta_end):
524
"""Extract a delta from source bytes, and apply it."""
526
cdef Py_ssize_t c_source_size
528
cdef Py_ssize_t c_delta_size
529
cdef Py_ssize_t c_delta_start, c_delta_end
531
if not PyString_CheckExact(source):
532
raise TypeError('source is not a str')
533
c_source_size = PyString_GET_SIZE(source)
534
c_delta_start = delta_start
535
c_delta_end = delta_end
536
if c_delta_start >= c_source_size:
537
raise ValueError('delta starts after source')
538
if c_delta_end > c_source_size:
539
raise ValueError('delta ends after source')
540
if c_delta_start >= c_delta_end:
541
raise ValueError('delta starts after it ends')
543
c_delta_size = c_delta_end - c_delta_start
544
c_source = PyString_AS_STRING(source)
545
c_delta = c_source + c_delta_start
546
# We don't use source_size, because we know the delta should not refer to
547
# any bytes after it starts
548
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
551
def encode_base128_int(val):
552
"""Convert an integer into a 7-bit lsb encoding."""
553
cdef unsigned int c_val
554
cdef Py_ssize_t count
555
cdef unsigned int num_bytes
556
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
560
while c_val >= 0x80 and count < 8:
561
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
564
if count >= 8 or c_val >= 0x80:
565
raise ValueError('encode_base128_int overflowed the buffer')
566
c_bytes[count] = <unsigned char>(c_val & 0xFF)
568
return PyString_FromStringAndSize(<char *>c_bytes, count)
571
def decode_base128_int(bytes):
572
"""Decode an integer from a 7-bit lsb encoding."""
575
cdef unsigned int uval
577
cdef Py_ssize_t num_low_bytes
578
cdef unsigned char *c_bytes
583
if not PyString_CheckExact(bytes):
584
raise TypeError('bytes is not a string')
585
c_bytes = <unsigned char*>PyString_AS_STRING(bytes)
586
# We take off 1, because we have to be able to decode the non-expanded byte
587
num_low_bytes = PyString_GET_SIZE(bytes) - 1
588
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
589
val = val | ((c_bytes[offset] & 0x7F) << shift)
592
if c_bytes[offset] & 0x80:
593
raise ValueError('Data not properly formatted, we ran out of'
594
' bytes before 0x80 stopped being set.')
595
val = val | (c_bytes[offset] << shift)
598
uval = <unsigned int> val