/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
1
# Copyright (C) 2008 Canonical Limited.
2
# 
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as published
5
# by the Free Software Foundation.
6
# 
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
# GNU General Public License for more details.
11
# 
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
15
# 
16
17
"""Compiled extensions for doing compression."""
18
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
19
cdef extern from *:
20
    ctypedef unsigned long size_t
21
    void * malloc(size_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
22
    void * realloc(void *, size_t)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
23
    void free(void *)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
24
    void memcpy(void *, void *, size_t)
25
26
cdef extern from "delta.h":
27
    struct delta_index:
28
        unsigned long memsize
29
        void *src_buf
30
        unsigned long src_size
31
        unsigned int hash_mask
32
        # struct index_entry *hash[]
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
33
    delta_index * create_delta_index(void *buf, unsigned long bufsize, unsigned
34
                                     long agg_src_offset)
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
35
    void free_delta_index(delta_index *index)
0.23.24 by John Arbash Meinel
Change the code so that we can pass in multiple sources to match against.
36
    void *create_delta(delta_index **indexes,
37
             unsigned int num_indexes,
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
38
             void *buf, unsigned long bufsize,
39
             unsigned long *delta_size, unsigned long max_delta_size)
40
    unsigned long get_delta_hdr_size(unsigned char **datap,
41
                                     unsigned char *top)
42
    Py_ssize_t DELTA_SIZE_MIN
0.23.7 by John Arbash Meinel
Add a apply_delta2 function, just in case it matters.
43
    void *patch_delta(void *src_buf, unsigned long src_size,
44
                      void *delta_buf, unsigned long delta_size,
45
                      unsigned long *dst_size)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
46
47
cdef extern from "Python.h":
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
48
    int PyString_CheckExact(object)
49
    char * PyString_AS_STRING(object)
50
    Py_ssize_t PyString_GET_SIZE(object)
51
    object PyString_FromStringAndSize(char *, Py_ssize_t)
52
53
0.23.25 by John Arbash Meinel
We are now able to add multiple sources to the delta generator.
54
cdef void *safe_malloc(size_t count) except NULL:
    # Allocate `count` bytes with malloc(). A NULL result is turned into a
    # MemoryError, so callers never have to test the returned pointer.
    cdef void *block
    block = malloc(count)
    if block != NULL:
        return block
    raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
60
61
62
cdef void *safe_realloc(void * old, size_t count) except NULL:
    # Resize the allocation `old` to `count` bytes with realloc() and return
    # the (possibly moved) pointer. A NULL result from realloc() is reported
    # as a MemoryError instead of being handed back to the caller.
    cdef void *block
    block = realloc(old, count)
    if block != NULL:
        return block
    raise MemoryError('Failed to reallocate to %d bytes of memory'
                      % (count,))
69
70
71
cdef int safe_free(void **val) except -1:
    # Free the allocation that `val` points at (when there is one) and reset
    # the caller's pointer to NULL so the same block is not freed twice.
    # `val` itself must not be NULL.
    assert val != NULL
    if val[0] == NULL:
        # Nothing was allocated, so there is nothing to release.
        return 0
    free(val[0])
    val[0] = NULL
    return 0
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
76
0.23.17 by John Arbash Meinel
Create a wrapper function, so that lsprof will properly attribute time spent.
77
def make_delta_index(source):
    """Return a new DeltaIndex built from ``source``.

    This is a plain function wrapper around the DeltaIndex constructor.

    :param source: Passed straight through to DeltaIndex().
    """
    index = DeltaIndex(source)
    return index
79
80
0.23.14 by John Arbash Meinel
Implement a DeltaIndex wrapper.
81
cdef class DeltaIndex:
    """Hold the delta indexes built from one or more source texts.

    Every text handed to add_source() is indexed with create_delta_index()
    and the resulting index is stored in a C array, so that make_delta() can
    match a target against all of the sources in a single create_delta()
    call.
    """

    # We need Pyrex 0.9.8+ to understand a 'list' definition, and this object
    # isn't performance critical
    # cdef readonly list _sources
    # The source strings are kept here so they stay referenced for as long
    # as this object lives.
    cdef readonly object _sources
    # C array of index pointers; slots [0, _num_indexes) are in use.
    cdef delta_index **_indexes
    cdef readonly unsigned int _num_indexes
    # Number of slots currently allocated in _indexes.
    cdef readonly unsigned int _max_num_indexes
    # Running offset: bytes of all sources added so far plus any
    # 'unadded_bytes' that callers reported between them.
    cdef public unsigned long _source_offset

    def __repr__(self):
        return '%s(%d, %d, %d)' % (self.__class__.__name__,
            len(self._sources), self._source_offset,
            self._num_indexes)

    def __init__(self, source=None):
        """Create the index table, optionally seeding it with ``source``.

        :param source: Optional byte string; when given it is added with
            add_source(source, 0).
        :raises MemoryError: If the index table cannot be allocated.
        """
        # __init__ may be invoked more than once on the same object. Release
        # any table from an earlier call so it is not leaked. On the first
        # call _indexes is still NULL and this does nothing.
        self._ensure_no_indexes()
        self._sources = []
        self._max_num_indexes = 1024
        self._indexes = <delta_index**>safe_malloc(sizeof(delta_index*)
                                                   * self._max_num_indexes)
        self._num_indexes = 0
        self._source_offset = 0

        if source is not None:
            self.add_source(source, 0)

    def __dealloc__(self):
        self._ensure_no_indexes()

    def add_source(self, source, unadded_bytes):
        """Add a new bit of source text to the delta indexes.

        :param source: The text in question, this must be a byte string
        :param unadded_bytes: Assume there are this many bytes that didn't get
            added between this source and the end of the previous source.
        :raises TypeError: If source is not exactly a str.
        :raises MemoryError: If the index table cannot be grown.
        """
        cdef char *c_source
        cdef Py_ssize_t c_source_size
        cdef delta_index *index
        cdef unsigned long agg_src_offset

        if not PyString_CheckExact(source):
            raise TypeError('source is not a str')

        # Do everything that can raise *before* modifying any state, so a
        # failure leaves the object exactly as it was and cannot leak a
        # freshly created index.
        agg_src_offset = self._source_offset + unadded_bytes
        if self._num_indexes + 1 >= self._max_num_indexes:
            self._expand_indexes()

        self._sources.append(source)
        c_source = PyString_AS_STRING(source)
        c_source_size = PyString_GET_SIZE(source)

        # TODO: Our usage is ultimately going to be different than the one
        #       that was originally designed. Specifically, we are going to
        #       want to be able to update the index by hashing future data. It
        #       should fit just fine into the structure. But for now, we just
        #       wrap create_delta_index (For example, we could always reserve
        #       enough space to hash a 4MB string, etc.)
        index = create_delta_index(c_source, c_source_size, agg_src_offset)
        self._source_offset = agg_src_offset + c_source_size
        # create_delta_index() may hand back NULL; in that case the source
        # still counts towards the offset but contributes no index.
        if index != NULL:
            self._indexes[self._num_indexes] = index
            self._num_indexes = self._num_indexes + 1

    cdef _expand_indexes(self):
        # Double the capacity of the _indexes array.
        # The new capacity is only recorded once the reallocation has
        # succeeded; otherwise a MemoryError would leave _max_num_indexes
        # larger than the array really is.
        cdef unsigned int new_max
        cdef delta_index **new_indexes

        new_max = self._max_num_indexes * 2
        new_indexes = <delta_index **>safe_realloc(self._indexes,
                                                   sizeof(delta_index *)
                                                   * new_max)
        self._indexes = new_indexes
        self._max_num_indexes = new_max

    cdef _ensure_no_indexes(self):
        # Free every stored index and the array holding them, then reset the
        # counters. Does nothing when no array has been allocated.
        cdef unsigned int i

        if self._indexes != NULL:
            for i from 0 <= i < self._num_indexes:
                free_delta_index(self._indexes[i])
                self._indexes[i] = NULL
            free(self._indexes)
            self._indexes = NULL
            self._max_num_indexes = 0
            self._num_indexes = 0

    def make_delta(self, target_bytes, max_delta_size=0):
        """Create a delta from the current source to the target bytes.

        :param target_bytes: The byte string to produce a delta for.
        :param max_delta_size: Passed through to create_delta().
        :return: The delta as a byte string, or None when no index has been
            added yet or create_delta() does not produce a delta.
        :raises TypeError: If target_bytes is not exactly a str.
        """
        cdef char *target
        cdef Py_ssize_t target_size
        cdef void * delta
        cdef unsigned long delta_size

        if self._num_indexes == 0:
            return None

        if not PyString_CheckExact(target_bytes):
            raise TypeError('target is not a str')

        target = PyString_AS_STRING(target_bytes)
        target_size = PyString_GET_SIZE(target_bytes)

        # TODO: inline some of create_delta so we at least don't have to double
        #       malloc, and can instead use PyString_FromStringAndSize, to
        #       allocate the bytes into the final string
        delta = create_delta(self._indexes, self._num_indexes,
                             target, target_size,
                             &delta_size, max_delta_size)
        result = None
        if delta:
            try:
                result = PyString_FromStringAndSize(<char *>delta, delta_size)
            finally:
                # The buffer must be released even when building the Python
                # string fails.
                free(delta)
        return result
192
0.23.6 by John Arbash Meinel
Start stripping out the actual GroupCompressor
193
194
def make_delta(source_bytes, target_bytes):
    """Create a delta from source_bytes => target_bytes.

    :param source_bytes: The byte string to delta against.
    :param target_bytes: The byte string the delta should produce.
    :return: The delta as a byte string, or None when either
        create_delta_index() or create_delta() returns NULL.
    :raises TypeError: If either argument is not exactly a str.
    """
    cdef char *source
    cdef Py_ssize_t source_size
    cdef char *target
    cdef Py_ssize_t target_size
    cdef delta_index *index
    cdef void * delta
    cdef unsigned long delta_size
    cdef unsigned long max_delta_size

    max_delta_size = 0 # Unlimited

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(target_bytes):
        raise TypeError('target is not a str')

    source = PyString_AS_STRING(source_bytes)
    source_size = PyString_GET_SIZE(source_bytes)
    target = PyString_AS_STRING(target_bytes)
    target_size = PyString_GET_SIZE(target_bytes)

    result = None
    index = create_delta_index(source, source_size, 0)
    if index != NULL:
        delta = create_delta(&index, 1, target, target_size,
                             &delta_size, max_delta_size)
        # The index is only needed while the delta is being computed.
        free_delta_index(index)
        if delta:
            try:
                result = PyString_FromStringAndSize(<char *>delta, delta_size)
            finally:
                # Release the C buffer even when creating the Python string
                # raises.
                free(delta)
    return result
227
228
229
def apply_delta(source_bytes, delta_bytes):
    """Apply a delta generated by make_delta to source_bytes.

    :param source_bytes: The byte string the delta was generated against. It
        may be longer than the source size recorded in the delta header; copy
        instructions are then limited to the recorded size.
    :param delta_bytes: The byte string holding the delta.
    :return: A new byte string with the reconstructed target.
    :raises TypeError: If either argument is not exactly a str.
    :raises RuntimeError: If the delta is too short, is truncated in the
        middle of an instruction, or contains an instruction that is not
        valid for the given source and target sizes.
    """
    cdef char *source
    cdef Py_ssize_t source_size
    cdef char *delta
    cdef Py_ssize_t delta_size
    cdef unsigned char *data, *top
    cdef unsigned char *dst_buf, *out, cmd
    cdef Py_ssize_t size
    cdef Py_ssize_t operand_bytes
    cdef unsigned long cp_off, cp_size

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(delta_bytes):
        raise TypeError('delta is not a str')

    source = PyString_AS_STRING(source_bytes)
    source_size = PyString_GET_SIZE(source_bytes)
    delta = PyString_AS_STRING(delta_bytes)
    delta_size = PyString_GET_SIZE(delta_bytes)

    # Code taken from patch-delta.c, only brought here to give better error
    # handling, and to avoid double allocating memory
    if (delta_size < DELTA_SIZE_MIN):
        # XXX: Invalid delta block
        raise RuntimeError('delta_size %d smaller than min delta size %d'
                           % (delta_size, DELTA_SIZE_MIN))

    data = <unsigned char *>delta
    top = data + delta_size

    # make sure the orig file size matches what we expect
    # XXX: gcc warns because data isn't defined as 'const'
    size = get_delta_hdr_size(&data, top)
    if (size > source_size):
        # XXX: mismatched source size
        raise RuntimeError('source size %d < expected source size %d'
                           % (source_size, size))
    # From here on only the first 'size' bytes of the source may be copied.
    source_size = size

    # now the result size
    size = get_delta_hdr_size(&data, top)
    result = PyString_FromStringAndSize(NULL, size)
    dst_buf = <unsigned char*>PyString_AS_STRING(result)
    # XXX: The original code added a trailing null here, but this shouldn't be
    #      necessary when using PyString_FromStringAndSize
    # dst_buf[size] = 0

    out = dst_buf
    # 'size' counts the target bytes that still have to be produced.
    while (data < top):
        cmd = data[0]
        data = data + 1
        if (cmd & 0x80):
            # Copy instruction. Each of the low 7 bits of cmd announces one
            # operand byte; make sure all of them are really present before
            # reading, so a truncated delta cannot make us read past 'top'.
            operand_bytes = ((cmd & 0x01) + ((cmd >> 1) & 0x01)
                             + ((cmd >> 2) & 0x01) + ((cmd >> 3) & 0x01)
                             + ((cmd >> 4) & 0x01) + ((cmd >> 5) & 0x01)
                             + ((cmd >> 6) & 0x01))
            if (operand_bytes > (top - data)):
                raise RuntimeError('Copy instruction truncated: needs %d'
                    ' operand bytes, only %d left in the delta'
                    % (operand_bytes, <int>(top - data)))
            # The operand bytes are widened to unsigned long before shifting.
            # Shifting the promoted (signed) int by 24 would sign-extend into
            # the upper half of a 64-bit cp_off.
            cp_off = cp_size = 0
            if (cmd & 0x01):
                cp_off = data[0]
                data = data + 1
            if (cmd & 0x02):
                cp_off = cp_off | ((<unsigned long>data[0]) << 8)
                data = data + 1
            if (cmd & 0x04):
                cp_off = cp_off | ((<unsigned long>data[0]) << 16)
                data = data + 1
            if (cmd & 0x08):
                cp_off = cp_off | ((<unsigned long>data[0]) << 24)
                data = data + 1
            if (cmd & 0x10):
                cp_size = data[0]
                data = data + 1
            if (cmd & 0x20):
                cp_size = cp_size | ((<unsigned long>data[0]) << 8)
                data = data + 1
            if (cmd & 0x40):
                cp_size = cp_size | ((<unsigned long>data[0]) << 16)
                data = data + 1
            if (cp_size == 0):
                cp_size = 0x10000
            # Reject wrap-around, copies that run past the end of the source
            # and copies larger than the space left in the target.
            if (cp_off + cp_size < cp_size or
                cp_off + cp_size > source_size or
                cp_size > size):
                raise RuntimeError('Something wrong with:'
                    ' cp_off = %s, cp_size = %s'
                    ' source_size = %s, size = %s'
                    % (cp_off, cp_size, source_size, size))
            memcpy(out, source + cp_off, cp_size)
            out = out + cp_size
            size = size - cp_size
        elif (cmd):
            # Insert instruction: the next 'cmd' bytes of the delta are
            # literal target content.
            if (cmd > size):
                raise RuntimeError('Insert instruction longer than remaining'
                    ' bytes: %d > %d' % (cmd, size))
            if (cmd > (top - data)):
                # The literal bytes must all be inside the delta itself.
                raise RuntimeError('Insert instruction longer than remaining'
                    ' delta: %d > %d' % (cmd, <int>(top - data)))
            memcpy(out, data, cmd)
            out = out + cmd
            data = data + cmd
            size = size - cmd
        else:
            # /*
            #  * cmd == 0 is reserved for future encoding
            #  * extensions. In the mean time we must fail when
            #  * encountering them (might be data corruption).
            #  */
            ## /* XXX: error("unexpected delta opcode 0"); */
            raise RuntimeError('Got delta opcode: 0, not supported')

    # /* sanity check */
    if (data != top or size != 0):
        ## /* XXX: error("delta replay has gone wild"); */
        raise RuntimeError('Did not extract the number of bytes we expected'
            ' we were left with %d bytes in "size", and top - data = %d'
            % (size, <int>(top - data)))

    # *dst_size = out - dst_buf;
    assert (out - dst_buf) == PyString_GET_SIZE(result)
    return result
0.23.7 by John Arbash Meinel
Add a apply_delta2 function, just in case it matters.
344
345
346
def apply_delta2(source_bytes, delta_bytes):
    """Apply a delta generated by make_delta to source_bytes.

    :param source_bytes: The byte string the delta was generated against.
    :param delta_bytes: The byte string holding the delta.
    :return: The reconstructed target as a byte string, or None when
        patch_delta() returns NULL.
    :raises TypeError: If either argument is not exactly a str.
    """
    # This defers to the patch-delta code rather than implementing it here
    # If this is faster, we can bring the memory allocation and error handling
    # into apply_delta(), and leave the primary loop in a separate C func.
    cdef char *source, *delta, *target
    cdef Py_ssize_t source_size, delta_size
    cdef unsigned long target_size

    if not PyString_CheckExact(source_bytes):
        raise TypeError('source is not a str')
    if not PyString_CheckExact(delta_bytes):
        raise TypeError('delta is not a str')

    source = PyString_AS_STRING(source_bytes)
    source_size = PyString_GET_SIZE(source_bytes)
    delta = PyString_AS_STRING(delta_bytes)
    delta_size = PyString_GET_SIZE(delta_bytes)

    target = <char *>patch_delta(source, source_size,
                                 delta, delta_size,
                                 &target_size)
    if target == NULL:
        return None
    try:
        result = PyString_FromStringAndSize(target, target_size)
    finally:
        # Release the buffer from patch_delta() even when creating the Python
        # string raises.
        free(target)
    return result