17
17
"""Compiled extensions for doing compression."""
19
from __future__ import absolute_import
20
22
cdef extern from "python-compat.h":
24
26
cdef extern from "Python.h":
27
ctypedef struct PyObject:
25
29
ctypedef int Py_ssize_t # Required for older pyrex versions
26
30
int PyString_CheckExact(object)
27
31
char * PyString_AS_STRING(object)
44
48
unsigned long agg_offset
45
49
struct delta_index:
47
delta_index * create_delta_index(source_info *src, delta_index *old) nogil
48
delta_index * create_delta_index_from_delta(source_info *delta,
49
delta_index *old) nogil
51
ctypedef enum delta_result:
59
delta_result create_delta_index(source_info *src,
62
int max_entries) nogil
63
delta_result create_delta_index_from_delta(source_info *delta,
65
delta_index **fresh) nogil
50
66
void free_delta_index(delta_index *index) nogil
51
void *create_delta(delta_index *indexes,
52
void *buf, unsigned long bufsize,
53
unsigned long *delta_size, unsigned long max_delta_size) nogil
67
delta_result create_delta(delta_index *indexes,
68
void *buf, unsigned long bufsize,
69
unsigned long *delta_size,
70
unsigned long max_delta_size,
71
void **delta_data) nogil
54
72
unsigned long get_delta_hdr_size(unsigned char **datap,
55
73
unsigned char *top) nogil
74
unsigned long sizeof_delta_index(delta_index *index)
56
75
Py_ssize_t DELTA_SIZE_MIN
76
int get_hash_offset(delta_index *index, int pos, unsigned int *hash_offset)
77
int get_entry_summary(delta_index *index, int pos,
78
unsigned int *global_offset, unsigned int *hash_val)
79
unsigned int rabin_hash (unsigned char *data)
59
82
cdef void *safe_malloc(size_t count) except NULL:
83
106
return DeltaIndex(source)
109
cdef object _translate_delta_failure(delta_result result):
    # Turn a failing delta_result status code into a Python exception
    # instance. The exception is *returned*, not raised, so call sites read
    # ``raise _translate_delta_failure(res)``.
    #
    # Each known code gets its own guard; anything that falls through all of
    # them is reported as an AssertionError carrying the numeric code.
    if result == DELTA_OUT_OF_MEMORY:
        return MemoryError("Delta function failed to allocate memory")
    if result == DELTA_INDEX_NEEDED:
        return ValueError("Delta function requires delta_index param")
    if result == DELTA_SOURCE_EMPTY:
        return ValueError("Delta function given empty source_info param")
    if result == DELTA_SOURCE_BAD:
        return RuntimeError("Delta function given invalid source_info param")
    if result == DELTA_BUFFER_EMPTY:
        return ValueError("Delta function given empty buffer params")
    return AssertionError("Unrecognised delta result code: %d" % result)
123
def _rabin_hash(content):
    """Return the C-level rabin_hash() of a byte string as a Python int.

    :param content: An exact ``str`` instance (subclasses are rejected) that
        is at least 16 bytes long.
    :return: The hash value computed by the C ``rabin_hash`` function.
    :raises ValueError: If content is not an exact str, or is shorter than
        16 bytes.
    """
    cdef unsigned int c_hash

    if not PyString_CheckExact(content):
        raise ValueError('content must be a string')
    if len(content) < 16:
        raise ValueError('content must be at least 16 bytes long')
    # rabin_hash() is handed only a pointer, with no length, which is why
    # the minimum size is enforced above before the buffer is exposed.
    c_hash = rabin_hash(<unsigned char*>(PyString_AS_STRING(content)))
    # Try to cast it to an int, if it can fit
    return int(c_hash)
86
132
cdef class DeltaIndex:
88
134
# We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
91
137
cdef readonly object _sources
92
138
cdef source_info *_source_infos
93
139
cdef delta_index *_index
140
cdef public unsigned long _source_offset
94
141
cdef readonly unsigned int _max_num_sources
95
cdef public unsigned long _source_offset
142
cdef public int _max_bytes_to_index
97
def __init__(self, source=None, max_bytes_to_index=None):
    """Set up an empty index, optionally seeded with a first source text.

    :param source: If not None, it is passed to ``add_source(source, 0)``
        once the object has been initialised.
    :param max_bytes_to_index: Value stored in ``_max_bytes_to_index``.
        None leaves it at 0.
    """
    cdef size_t infos_size

    self._sources = []
    self._index = NULL
    self._max_num_sources = 65000
    # Room for every source_info slot is reserved up front.
    infos_size = sizeof(source_info) * self._max_num_sources
    self._source_infos = <source_info *>safe_malloc(infos_size)
    self._source_offset = 0
    if max_bytes_to_index is None:
        self._max_bytes_to_index = 0
    else:
        self._max_bytes_to_index = max_bytes_to_index

    if source is not None:
        self.add_source(source, 0)
158
def __sizeof__(self):
    """Return an estimate, in bytes, of the memory owned by this object.

    The figure covers the object's own fields, the preallocated
    ``_source_infos`` array and whatever ``sizeof_delta_index`` reports for
    the current index.

    :return: The estimated size as an integer.
    """
    # We want to track the _source_infos allocations, but the referenced
    # void* are actually tracked in _sources itself.
    # XXX: Cython is capable of doing sizeof(class) and returning the size
    #      of the underlying struct. Pyrex (<= 0.9.9) refuses, so we need
    #      to do it manually. *sigh* Note that we might get it wrong
    #      because of alignment issues.
    cdef Py_ssize_t size

    # PyObject start, vtable *, 3 object pointers (_sources, _source_infos,
    # _index), then the C integer fields: _source_offset (unsigned long),
    # _max_num_sources (unsigned int) and _max_bytes_to_index (int).
    size = ((sizeof(PyObject) + sizeof(void*) + 3*sizeof(PyObject*)
             + sizeof(unsigned long)
             + sizeof(unsigned int)
             + sizeof(int))
            + (sizeof(source_info) * self._max_num_sources)
            + sizeof_delta_index(self._index))
    # __sizeof__ must hand the computed value back to its caller.
    return size
108
174
def __repr__(self):
    """Describe the index as ``ClassName(number_of_sources, source_offset)``."""
    class_name = self.__class__.__name__
    source_count = len(self._sources)
    return '%s(%d, %d)' % (class_name, source_count, self._source_offset)
118
184
def _has_index(self):
    """Report whether a C delta_index is currently attached (non-NULL)."""
    return self._index != NULL
187
def _dump_index(self):
188
"""Dump the pointers in the index.
190
This is an arbitrary layout, used for testing. It is not meant to be
191
used in production code.
193
:return: (hash_list, entry_list)
194
hash_list A list of offsets, so hash[i] points to the 'hash
195
bucket' starting at the given offset and going until
197
entry_list A list of (text_offset, hash_val). text_offset is the
198
offset in the "source" texts, and hash_val is the RABIN
199
hash for that offset.
200
Note that the entry should be in the hash bucket
202
hash[(hash_val & mask)] && hash[(hash_val & mask) + 1]
205
cdef unsigned int text_offset
206
cdef unsigned int hash_val
207
cdef unsigned int hash_offset
208
if self._index == NULL:
212
while get_hash_offset(self._index, pos, &hash_offset):
213
hash_list.append(int(hash_offset))
217
while get_entry_summary(self._index, pos, &text_offset, &hash_val):
218
# Map back using 'int' so that we don't get Long everywhere, when
219
# almost everything is <2**31.
220
val = tuple(map(int, [text_offset, hash_val]))
221
entry_list.append(val)
223
return hash_list, entry_list
121
225
def add_delta_source(self, delta, unadded_bytes):
122
226
"""Add a new delta to the source texts.
146
251
src.size = c_delta_size
147
252
src.agg_offset = self._source_offset + unadded_bytes
149
index = create_delta_index_from_delta(src, self._index)
254
res = create_delta_index_from_delta(src, self._index, &index)
256
raise _translate_delta_failure(res)
150
257
self._source_offset = src.agg_offset + src.size
258
if index != self._index:
152
259
free_delta_index(self._index)
153
260
self._index = index
158
265
:param source: The text in question, this must be a byte string
159
266
:param unadded_bytes: Assume there are this many bytes that didn't get
160
267
added between this source and the end of the previous source.
268
:param max_pointers: Add no more than this many entries to the index.
269
By default, we sample every 16 bytes, if that would require more
270
than max_entries, we will reduce the sampling rate.
271
A value of 0 means unlimited, None means use the default limit.
162
273
cdef char *c_source
163
274
cdef Py_ssize_t c_source_size
164
275
cdef delta_index *index
276
cdef delta_result res
165
277
cdef unsigned int source_location
166
278
cdef source_info *src
167
279
cdef unsigned int num_indexes
280
cdef int max_num_entries
169
282
if not PyString_CheckExact(source):
170
283
raise TypeError('source is not a str')
187
300
# We delay creating the index on the first insert
188
301
if source_location != 0:
190
index = create_delta_index(src, self._index)
303
res = create_delta_index(src, self._index, &index,
304
self._max_bytes_to_index)
306
raise _translate_delta_failure(res)
307
if index != self._index:
192
308
free_delta_index(self._index)
193
309
self._index = index
195
311
cdef _populate_first_index(self):
196
312
cdef delta_index *index
313
cdef delta_result res
197
314
if len(self._sources) != 1 or self._index != NULL:
198
315
raise AssertionError('_populate_first_index should only be'
199
316
' called when we have a single source and no index yet')
201
# We know that self._index is already NULL, so whatever
202
# create_delta_index returns is fine
318
# We know that self._index is already NULL, so create_delta_index
319
# will always create a new index unless there's a malloc failure
204
self._index = create_delta_index(&self._source_infos[0], NULL)
205
assert self._index != NULL
321
res = create_delta_index(&self._source_infos[0], NULL, &index,
322
self._max_bytes_to_index)
324
raise _translate_delta_failure(res)
207
327
cdef _expand_sources(self):
208
328
raise RuntimeError('if we move self._source_infos, then we need to'
237
358
# allocate the bytes into the final string
238
359
c_max_delta_size = max_delta_size
240
delta = create_delta(self._index,
242
&delta_size, c_max_delta_size)
361
res = create_delta(self._index, target, target_size,
362
&delta_size, c_max_delta_size, &delta)
245
365
result = PyString_FromStringAndSize(<char *>delta, delta_size)
367
elif res != DELTA_SIZE_TOO_BIG:
368
raise _translate_delta_failure(res)
350
472
# Copy instruction
351
473
data = _decode_copy_instruction(data, cmd, &cp_off, &cp_size)
352
474
if (cp_off + cp_size < cp_size or
353
cp_off + cp_size > source_size or
475
cp_off + cp_size > <unsigned int>source_size or
476
cp_size > <unsigned int>size):
357
479
memcpy(out, source + cp_off, cp_size)