1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Compiled extensions for doing compression."""
19
from __future__ import absolute_import
22
cdef extern from "python-compat.h":
26
cdef extern from "Python.h":
27
ctypedef struct PyObject:
29
int PyBytes_CheckExact(object)
30
char * PyBytes_AS_STRING(object)
31
Py_ssize_t PyBytes_GET_SIZE(object)
32
object PyBytes_FromStringAndSize(char *, Py_ssize_t)
36
ctypedef unsigned long size_t
37
void * malloc(size_t) nogil
38
void * realloc(void *, size_t) nogil
39
void free(void *) nogil
40
void memcpy(void *, void *, size_t) nogil
43
cdef extern from "delta.h":
47
unsigned long agg_offset
50
ctypedef enum delta_result:
58
delta_result create_delta_index(source_info *src,
61
int max_entries) nogil
62
delta_result create_delta_index_from_delta(source_info *delta,
64
delta_index **fresh) nogil
65
void free_delta_index(delta_index *index) nogil
66
delta_result create_delta(delta_index *indexes,
67
void *buf, unsigned long bufsize,
68
unsigned long *delta_size,
69
unsigned long max_delta_size,
70
void **delta_data) nogil
71
unsigned long get_delta_hdr_size(unsigned char **datap,
72
unsigned char *top) nogil
73
unsigned long sizeof_delta_index(delta_index *index)
74
Py_ssize_t DELTA_SIZE_MIN
75
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
76
int get_entry_summary(delta_index *index, int pos,
77
unsigned int *global_offset, unsigned int *hash_val)
78
unsigned int rabin_hash (unsigned char *data)
81
cdef void *safe_malloc(size_t count) except NULL:
83
result = malloc(count)
85
raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
89
cdef void *safe_realloc(void * old, size_t count) except NULL:
91
result = realloc(old, count)
93
raise MemoryError('Failed to reallocate to %d bytes of memory'
98
cdef int safe_free(void **val) except -1:
104
def make_delta_index(source):
105
return DeltaIndex(source)
108
cdef object _translate_delta_failure(delta_result result):
109
if result == DELTA_OUT_OF_MEMORY:
110
return MemoryError("Delta function failed to allocate memory")
111
elif result == DELTA_INDEX_NEEDED:
112
return ValueError("Delta function requires delta_index param")
113
elif result == DELTA_SOURCE_EMPTY:
114
return ValueError("Delta function given empty source_info param")
115
elif result == DELTA_SOURCE_BAD:
116
return RuntimeError("Delta function given invalid source_info param")
117
elif result == DELTA_BUFFER_EMPTY:
118
return ValueError("Delta function given empty buffer params")
119
return AssertionError("Unrecognised delta result code: %d" % result)
122
def _rabin_hash(content):
123
if not PyBytes_CheckExact(content):
124
raise ValueError('content must be a string')
125
if len(content) < 16:
126
raise ValueError('content must be at least 16 bytes long')
127
# Try to cast it to an int, if it can fit
128
return int(rabin_hash(<unsigned char*>(PyBytes_AS_STRING(content))))
131
cdef class DeltaIndex:
133
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
134
# isn't performance critical
135
# cdef readonly list _sources
136
cdef readonly object _sources
137
cdef source_info *_source_infos
138
cdef delta_index *_index
139
cdef public unsigned long _source_offset
140
cdef readonly unsigned int _max_num_sources
141
cdef public int _max_bytes_to_index
143
def __init__(self, source=None, max_bytes_to_index=None):
146
self._max_num_sources = 65000
147
self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
148
* self._max_num_sources)
149
self._source_offset = 0
150
self._max_bytes_to_index = 0
151
if max_bytes_to_index is not None:
152
self._max_bytes_to_index = max_bytes_to_index
154
if source is not None:
155
self.add_source(source, 0)
157
def __sizeof__(self):
158
# We want to track the _source_infos allocations, but the referenced
159
# void* are actually tracked in _sources itself.
160
# XXX: Cython is capable of doing sizeof(class) and returning the size
161
# of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
162
# to do it manually. *sigh* Note that we might get it wrong
163
# because of alignment issues.
165
# PyObject start, vtable *, 3 object pointers, 2 C ints
166
size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
167
+ sizeof(unsigned long)
168
+ sizeof(unsigned int))
169
+ (sizeof(source_info) * self._max_num_sources)
170
+ sizeof_delta_index(self._index))
174
return '%s(%d, %d)' % (self.__class__.__name__,
175
len(self._sources), self._source_offset)
177
def __dealloc__(self):
178
if self._index != NULL:
179
free_delta_index(self._index)
181
safe_free(<void **>&self._source_infos)
183
def _has_index(self):
184
return (self._index != NULL)
186
def _dump_index(self):
187
"""Dump the pointers in the index.
189
This is an arbitrary layout, used for testing. It is not meant to be
190
used in production code.
192
:return: (hash_list, entry_list)
193
hash_list A list of offsets, so hash[i] points to the 'hash
194
bucket' starting at the given offset and going until
196
entry_list A list of (text_offset, hash_val). text_offset is the
197
offset in the "source" texts, and hash_val is the RABIN
198
hash for that offset.
199
Note that the entry should be in the hash bucket
201
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
204
cdef unsigned int text_offset
205
cdef unsigned int hash_val
206
cdef unsigned int hash_offset
207
if self._index == NULL:
211
while get_hash_offset(self._index, pos, &hash_offset):
212
hash_list.append(int(hash_offset))
216
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
217
# Map back using 'int' so that we don't get Long everywhere, when
218
# almost everything is <2**31.
219
val = tuple(map(int, [text_offset, hash_val]))
220
entry_list.append(val)
222
return hash_list, entry_list
224
def add_delta_source(self, delta, unadded_bytes):
225
"""Add a new delta to the source texts.
227
:param delta: The text of the delta, this must be a byte string.
228
:param unadded_bytes: Number of bytes that were added to the source
229
that were not indexed.
232
cdef Py_ssize_t c_delta_size
233
cdef delta_index *index
234
cdef delta_result res
235
cdef unsigned int source_location
236
cdef source_info *src
237
cdef unsigned int num_indexes
239
if not PyBytes_CheckExact(delta):
240
raise TypeError('delta is not a bytestring')
242
source_location = len(self._sources)
243
if source_location >= self._max_num_sources:
244
self._expand_sources()
245
self._sources.append(delta)
246
c_delta = PyBytes_AS_STRING(delta)
247
c_delta_size = PyBytes_GET_SIZE(delta)
248
src = self._source_infos + source_location
250
src.size = c_delta_size
251
src.agg_offset = self._source_offset + unadded_bytes
253
res = create_delta_index_from_delta(src, self._index, &index)
255
raise _translate_delta_failure(res)
256
self._source_offset = src.agg_offset + src.size
257
if index != self._index:
258
free_delta_index(self._index)
261
def add_source(self, source, unadded_bytes):
262
"""Add a new bit of source text to the delta indexes.
264
:param source: The text in question, this must be a byte string
265
:param unadded_bytes: Assume there are this many bytes that didn't get
266
added between this source and the end of the previous source.
267
:param max_pointers: Add no more than this many entries to the index.
268
By default, we sample every 16 bytes, if that would require more
269
than max_entries, we will reduce the sampling rate.
270
A value of 0 means unlimited, None means use the default limit.
273
cdef Py_ssize_t c_source_size
274
cdef delta_index *index
275
cdef delta_result res
276
cdef unsigned int source_location
277
cdef source_info *src
278
cdef unsigned int num_indexes
279
cdef int max_num_entries
281
if not PyBytes_CheckExact(source):
282
raise TypeError('source is not a bytestring')
284
source_location = len(self._sources)
285
if source_location >= self._max_num_sources:
286
self._expand_sources()
287
if source_location != 0 and self._index == NULL:
288
# We were lazy about populating the index, create it now
289
self._populate_first_index()
290
self._sources.append(source)
291
c_source = PyBytes_AS_STRING(source)
292
c_source_size = PyBytes_GET_SIZE(source)
293
src = self._source_infos + source_location
295
src.size = c_source_size
297
src.agg_offset = self._source_offset + unadded_bytes
298
self._source_offset = src.agg_offset + src.size
299
# We delay creating the index on the first insert
300
if source_location != 0:
302
res = create_delta_index(src, self._index, &index,
303
self._max_bytes_to_index)
305
raise _translate_delta_failure(res)
306
if index != self._index:
307
free_delta_index(self._index)
310
cdef _populate_first_index(self):
311
cdef delta_index *index
312
cdef delta_result res
313
if len(self._sources) != 1 or self._index != NULL:
314
raise AssertionError('_populate_first_index should only be'
315
' called when we have a single source and no index yet')
317
# We know that self._index is already NULL, so create_delta_index
318
# will always create a new index unless there's a malloc failure
320
res = create_delta_index(&self._source_infos[0], NULL, &index,
321
self._max_bytes_to_index)
323
raise _translate_delta_failure(res)
326
cdef _expand_sources(self):
327
raise RuntimeError('if we move self._source_infos, then we need to'
328
' change all of the index pointers as well.')
329
self._max_num_sources = self._max_num_sources * 2
330
self._source_infos = <source_info *>safe_realloc(self._source_infos,
332
* self._max_num_sources)
334
def make_delta(self, target_bytes, max_delta_size=0):
335
"""Create a delta from the current source to the target bytes."""
337
cdef Py_ssize_t target_size
339
cdef unsigned long delta_size
340
cdef unsigned long c_max_delta_size
341
cdef delta_result res
343
if self._index == NULL:
344
if len(self._sources) == 0:
346
# We were just lazy about generating the index
347
self._populate_first_index()
349
if not PyBytes_CheckExact(target_bytes):
350
raise TypeError('target is not a bytestring')
352
target = PyBytes_AS_STRING(target_bytes)
353
target_size = PyBytes_GET_SIZE(target_bytes)
355
# TODO: inline some of create_delta so we at least don't have to double
356
# malloc, and can instead use PyBytes_FromStringAndSize, to
357
# allocate the bytes into the final string
358
c_max_delta_size = max_delta_size
360
res = create_delta(self._index, target, target_size,
361
&delta_size, c_max_delta_size, &delta)
364
result = PyBytes_FromStringAndSize(<char *>delta, delta_size)
366
elif res != DELTA_SIZE_TOO_BIG:
367
raise _translate_delta_failure(res)
371
def make_delta(source_bytes, target_bytes):
372
"""Create a delta, this is a wrapper around DeltaIndex.make_delta."""
373
di = DeltaIndex(source_bytes)
374
return di.make_delta(target_bytes)
377
def apply_delta(source_bytes, delta_bytes):
378
"""Apply a delta generated by make_delta to source_bytes."""
380
cdef Py_ssize_t source_size
382
cdef Py_ssize_t delta_size
384
if not PyBytes_CheckExact(source_bytes):
385
raise TypeError('source is not a bytestring')
386
if not PyBytes_CheckExact(delta_bytes):
387
raise TypeError('delta is not a bytestring')
388
source = PyBytes_AS_STRING(source_bytes)
389
source_size = PyBytes_GET_SIZE(source_bytes)
390
delta = PyBytes_AS_STRING(delta_bytes)
391
delta_size = PyBytes_GET_SIZE(delta_bytes)
392
# Code taken from patch-delta.c, only brought here to give better error
393
# handling, and to avoid double allocating memory
394
if (delta_size < DELTA_SIZE_MIN):
395
# XXX: Invalid delta block
396
raise RuntimeError('delta_size %d smaller than min delta size %d'
397
% (delta_size, DELTA_SIZE_MIN))
399
return _apply_delta(source, source_size, delta, delta_size)
402
cdef unsigned char *_decode_copy_instruction(unsigned char *bytes,
403
unsigned char cmd, unsigned int *offset,
404
unsigned int *length) nogil: # cannot_raise
405
"""Decode a copy instruction from the next few bytes.
407
A copy instruction is a variable number of bytes, so we will parse the
408
bytes we care about, and return the new position, as well as the offset and
409
length referred to in the bytes.
411
:param bytes: Pointer to the start of bytes after cmd
412
:param cmd: The command code
413
:return: Pointer to the bytes just after the last decode byte
415
cdef unsigned int off, size, count
423
off = off | (bytes[count] << 8)
426
off = off | (bytes[count] << 16)
429
off = off | (bytes[count] << 24)
435
size = size | (bytes[count] << 8)
438
size = size | (bytes[count] << 16)
447
cdef object _apply_delta(char *source, Py_ssize_t source_size,
448
char *delta, Py_ssize_t delta_size):
449
"""common functionality between apply_delta and apply_delta_to_source."""
450
cdef unsigned char *data, *top
451
cdef unsigned char *dst_buf, *out, cmd
453
cdef unsigned int cp_off, cp_size
456
data = <unsigned char *>delta
457
top = data + delta_size
459
# now the result size
460
size = get_delta_hdr_size(&data, top)
461
result = PyBytes_FromStringAndSize(NULL, size)
462
dst_buf = <unsigned char*>PyBytes_AS_STRING(result)
472
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
473
if (cp_off + cp_size < cp_size or
474
cp_off + cp_size > <unsigned int>source_size or
475
cp_size > <unsigned int>size):
478
memcpy(out, source + cp_off, cp_size)
480
size = size - cp_size
484
# cmd == 0 is reserved for future encoding
485
# extensions. In the mean time we must fail when
486
# encountering them (might be data corruption).
492
memcpy(out, data, cmd)
498
raise ValueError('Something wrong with:'
499
' cp_off = %s, cp_size = %s'
500
' source_size = %s, size = %s'
501
% (cp_off, cp_size, source_size, size))
503
raise ValueError('Got delta opcode: 0, not supported')
505
raise ValueError('Insert instruction longer than remaining'
506
' bytes: %d > %d' % (cmd, size))
509
if (data != top or size != 0):
510
raise RuntimeError('Did not extract the number of bytes we expected'
511
' we were left with %d bytes in "size", and top - data = %d'
512
% (size, <int>(top - data)))
515
# *dst_size = out - dst_buf;
516
if (out - dst_buf) != PyBytes_GET_SIZE(result):
517
raise RuntimeError('Number of bytes extracted did not match the'
518
' size encoded in the delta header.')
522
def apply_delta_to_source(source, delta_start, delta_end):
523
"""Extract a delta from source bytes, and apply it."""
525
cdef Py_ssize_t c_source_size
527
cdef Py_ssize_t c_delta_size
528
cdef Py_ssize_t c_delta_start, c_delta_end
530
if not PyBytes_CheckExact(source):
531
raise TypeError('source is not a str')
532
c_source_size = PyBytes_GET_SIZE(source)
533
c_delta_start = delta_start
534
c_delta_end = delta_end
535
if c_delta_start >= c_source_size:
536
raise ValueError('delta starts after source')
537
if c_delta_end > c_source_size:
538
raise ValueError('delta ends after source')
539
if c_delta_start >= c_delta_end:
540
raise ValueError('delta starts after it ends')
542
c_delta_size = c_delta_end - c_delta_start
543
c_source = PyBytes_AS_STRING(source)
544
c_delta = c_source + c_delta_start
545
# We don't use source_size, because we know the delta should not refer to
546
# any bytes after it starts
547
return _apply_delta(c_source, c_delta_start, c_delta, c_delta_size)
550
def encode_base128_int(val):
551
"""Convert an integer into a 7-bit lsb encoding."""
552
cdef unsigned int c_val
553
cdef Py_ssize_t count
554
cdef unsigned int num_bytes
555
cdef unsigned char c_bytes[8] # max size for 32-bit int is 5 bytes
559
while c_val >= 0x80 and count < 8:
560
c_bytes[count] = <unsigned char>((c_val | 0x80) & 0xFF)
563
if count >= 8 or c_val >= 0x80:
564
raise ValueError('encode_base128_int overflowed the buffer')
565
c_bytes[count] = <unsigned char>(c_val & 0xFF)
567
return PyBytes_FromStringAndSize(<char *>c_bytes, count)
570
def decode_base128_int(bytes):
571
"""Decode an integer from a 7-bit lsb encoding."""
574
cdef unsigned int uval
576
cdef Py_ssize_t num_low_bytes
577
cdef unsigned char *c_bytes
582
if not PyBytes_CheckExact(bytes):
583
raise TypeError('bytes is not a string')
584
c_bytes = <unsigned char*>PyBytes_AS_STRING(bytes)
585
# We take off 1, because we have to be able to decode the non-expanded byte
586
num_low_bytes = PyBytes_GET_SIZE(bytes) - 1
587
while (c_bytes[offset] & 0x80) and offset < num_low_bytes:
588
val = val | ((c_bytes[offset] & 0x7F) << shift)
591
if c_bytes[offset] & 0x80:
592
raise ValueError('Data not properly formatted, we ran out of'
593
' bytes before 0x80 stopped being set.')
594
val = val | (c_bytes[offset] << shift)
597
uval = <unsigned int> val