/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
# Copyright (C) 2008 Canonical Limited.
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as published
# by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
# 

"""Compiled extensions for doing compression."""

cdef extern from *:
    ctypedef unsigned long size_t
    void * malloc(size_t)
    void * realloc(void *, size_t)
    void free(void *)

cdef extern from "Python.h":
    struct _PyObject:
        pass
    ctypedef _PyObject PyObject
    PyObject *PySequence_Fast(object, char *) except NULL
    Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)
    PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)
    PyObject *PyList_GET_ITEM(object, Py_ssize_t)
    int PyList_Append(object, object) except -1
    long PyObject_Hash(PyObject *) except -1
    # We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
    # if there is an exception *for* us.
    int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1
    int PyObject_Not(PyObject *) except -1
    void Py_DECREF(PyObject *)
    void Py_INCREF(PyObject *)

cdef enum _raw_line_flags:
    INDEXED  = 0x01

cdef struct _raw_line:
    long hash              # Cached form of the hash for this entry
    Py_ssize_t hash_offset # The location in the hash table for this object
    Py_ssize_t next_line_index # Next line which is equivalent to this one
    int flags              # status flags
    PyObject *data         # Raw pointer to the original line


cdef struct _hash_bucket:
    Py_ssize_t line_index # First line in the left side for this bucket
    Py_ssize_t count      # Number of equivalent lines; do we even need this?


cdef Py_ssize_t SENTINEL
SENTINEL = -1


cdef void *safe_malloc(size_t count) except NULL:
    cdef void *result
    result = malloc(count)
    if result == NULL:
        raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
    return result


cdef void *safe_realloc(void * old, size_t count) except NULL:
    cdef void *result
    result = realloc(old, count)
    if result == NULL:
        raise MemoryError('Failed to reallocate to %d bytes of memory'
                          % (count,))
    return result


cdef int safe_free(void **val) except -1:
    assert val != NULL
    if val[0] != NULL:
        free(val[0])
        val[0] = NULL


cdef class EquivalenceTable:
    """This tracks equivalencies between lists of hashable objects.

    :ivar lines: The 'static' lines that will be preserved between runs.
    """

    cdef readonly object lines
    cdef readonly object _right_lines
    cdef Py_ssize_t _hashtable_size
    cdef Py_ssize_t _hashtable_bitmask
    cdef _hash_bucket *_hashtable
    cdef _raw_line *_raw_lines
    cdef Py_ssize_t _len_lines

    def __init__(self, lines):
        self.lines = list(lines)
        self._right_lines = None
        self._hashtable_size = 0
        self._hashtable = NULL
        self._raw_lines = NULL
        self._len_lines = 0
        self._lines_to_raw_lines(lines)
        self._build_hash_table()

    def __dealloc__(self):
        safe_free(<void**>&self._hashtable)
        safe_free(<void**>&self._raw_lines)

    cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:
        """Convert a single PyObject into a raw line."""
        raw_line.hash = PyObject_Hash(line)
        raw_line.next_line_index = SENTINEL
        raw_line.hash_offset = SENTINEL
        raw_line.flags = INDEXED
        raw_line.data = line
        
    cdef int _lines_to_raw_lines(self, object lines) except -1:
        """Load a sequence of objects into the _raw_line format"""
        cdef Py_ssize_t count, i
        cdef PyObject *seq
        cdef PyObject *item
        cdef _raw_line *raw

        # Do we want to use PySequence_Fast, or just assume that it is a list
        # Also, do we need to decref the return value?
        # http://www.python.org/doc/current/api/sequence.html
        seq = PySequence_Fast(lines, "expected a sequence")
        try:
            count = PySequence_Fast_GET_SIZE(seq)
            if count == 0:
                return 0
            raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
            safe_free(<void**>&self._raw_lines)
            self._raw_lines = raw
            self._len_lines = count

            for i from 0 <= i < count:
                item = PySequence_Fast_GET_ITEM(seq, i)
                # NB: We don't Py_INCREF the data pointer, because we know we
                #     maintain a pointer to the item in self.lines or
                #     self._right_lines
                # TODO: Do we even need to track a data pointer here? It would
                #       be less memory, and we *could* just look it up in the
                #       appropriate line list.
                self._line_to_raw_line(item, &raw[i])
        finally:
            # TODO: Unfortunately, try/finally always generates a compiler
            #       warning about a possibly unused variable :(
            Py_DECREF(seq)
        return count

    cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):
        """Determine the smallest hash size that can reasonably fit the data.
        
        :param needed: The number of entries we might be inserting into
            the hash (assuming there are no duplicate lines.)
        :return: The number of hash table entries to use. This will always be a
            power of two.
        """
        cdef Py_ssize_t hash_size
        cdef Py_ssize_t min_size

        # TODO: Another alternative would be to actually count how full the
        #       hash-table is, and decide if we need to grow it based on
        #       density. That can take into account duplicated lines. Though if
        #       we compress well, there should be a minimal amount of
        #       duplicated lines in the output.

        # At the bare minimum, we could fit all entries into a 'needed'
        # size hash table. However, any collision would then have a long way to
        # traverse before it could find a 'free' slot.
        # So we set the minimum size to give us 33% empty slots.
        min_size = <Py_ssize_t>(needed * 1.5)
        hash_size = 1
        while hash_size < min_size:
            hash_size = hash_size << 1
        return hash_size
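
    # A worked example of the sizing rule above (numbers are illustrative, not
    # from the test suite): needed=100 gives min_size=150, and the first power
    # of two at or above that is 256, leaving roughly 60% of the slots empty.
    # The Python wrapper below exposes this for checking:
    #   EquivalenceTable([])._py_compute_minimum_hash_size(100)  # -> 256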

    def _py_compute_minimum_hash_size(self, needed):
        """Expose _compute_minimum_hash_size to python for testing."""
        return self._compute_minimum_hash_size(needed)

    cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):
        """Determine a reasonable hash size, assuming some room for growth.
        
        :param needed: The number of entries we might be inserting into
            the hash (assuming there are no duplicate lines.)
        :return: The number of hash table entries to use. This will always be a
            power of two.
        """
        cdef Py_ssize_t hash_size
        cdef Py_ssize_t min_size

        # We start off with an 8k hash (after the final doubling), because
        # there isn't a lot of reason to go smaller than that (this class isn't
        # one you'll be instantiating thousands of, and you are always likely
        # to grow here.)
        hash_size = 4096
        while hash_size < needed:
            hash_size = hash_size << 1
        # And we always give at least 2x blank space
        hash_size = hash_size << 1
        return hash_size
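
    # A worked example (illustrative): needed=100 never exceeds the 4096
    # starting size, so the final doubling gives 8192; needed=10000 doubles
    # 4096 up to 16384 and then once more, giving 32768.
    #   EquivalenceTable([])._py_compute_recommended_hash_size(10000)  # -> 32768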

    def _py_compute_recommended_hash_size(self, needed):
        """Expose _compute_recommended_hash_size to python for testing."""
        return self._compute_recommended_hash_size(needed)

    cdef int _build_hash_table(self) except -1:
        """Build the hash table 'from scratch'."""
        cdef Py_ssize_t hash_size
        cdef Py_ssize_t hash_bitmask
        cdef Py_ssize_t i
        cdef _raw_line *cur_line 
        cdef Py_ssize_t hash_offset
        cdef _hash_bucket *cur_bucket
        cdef _hash_bucket *new_hashtable

        # Hash size is a power of 2
        hash_size = self._compute_recommended_hash_size(self._len_lines)

        new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
                                                   hash_size)
        safe_free(<void**>&self._hashtable)
        self._hashtable = new_hashtable

        self._hashtable_size = hash_size
        for i from 0 <= i < hash_size:
            self._hashtable[i].line_index = SENTINEL
            self._hashtable[i].count = 0

        # Turn the hash size into a bitmask
        self._hashtable_bitmask = hash_size - 1

        # Iterate backwards, because it makes it easier to insert items into
        # the hash (you just change the head pointer, and everything else keeps
        # pointing to the same location).
        for i from self._len_lines > i >= 0:
            cur_line = self._raw_lines + i
            if not (cur_line.flags & INDEXED):
                continue
            hash_offset = self._find_hash_position(cur_line)

            # Point this line to the location in the hash table
            cur_line.hash_offset = hash_offset
            # And make this line the head of this bucket's chain
            cur_bucket = self._hashtable + hash_offset
            cur_line.next_line_index = cur_bucket.line_index
            cur_bucket.line_index = i
            cur_bucket.count += 1
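
    # Added commentary on ordering: because the loop above walks the lines
    # backwards and pushes each one onto the head of its bucket's chain, each
    # chain ends up sorted by ascending line index.  _extend_hash_table_raw
    # below appends new lines at the tail, which preserves that order, so
    # get_matches() reports indices in increasing order.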

    cdef int _extend_hash_table_raw(self, PyObject *seq_index) except -1:
        cdef Py_ssize_t new_count
        cdef Py_ssize_t new_total_len
        cdef Py_ssize_t old_len
        cdef PyObject *item
        cdef PyObject *should_index
        cdef Py_ssize_t i
        cdef Py_ssize_t line_index
        cdef _hash_bucket *cur_bucket
        cdef _raw_line *cur_line
        cdef _raw_line *next_line
        cdef Py_ssize_t hash_offset
        cdef PyObject *local_lines

        old_len = self._len_lines
        new_count = PySequence_Fast_GET_SIZE(seq_index) 
        new_total_len = new_count + self._len_lines
        self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,
                                new_total_len * sizeof(_raw_line))
        self._len_lines = new_total_len
        # Now that we have enough space, start adding the new lines
        # into the array. These are done in forward order.
        for i from 0 <= i < new_count:
            line_index = i + old_len
            cur_line = self._raw_lines + line_index
            item = PyList_GET_ITEM(self.lines, line_index)
            self._line_to_raw_line(item, cur_line)
            should_index = PySequence_Fast_GET_ITEM(seq_index, i)
            if PyObject_Not(should_index):
                cur_line.flags &= ~(<int>INDEXED)
                continue
            hash_offset = self._find_hash_position(cur_line)

            # Point this line to the location in the hash table
            cur_line.hash_offset = hash_offset

            # Make this line the tail of this bucket's chain
            cur_bucket = self._hashtable + hash_offset
            cur_bucket.count += 1
            if cur_bucket.line_index == SENTINEL:
                cur_bucket.line_index = line_index
                continue
            # We need to track through the pointers and insert this at
            # the end
            next_line = self._raw_lines + cur_bucket.line_index
            while next_line.next_line_index != SENTINEL:
                next_line = self._raw_lines + next_line.next_line_index
            next_line.next_line_index = line_index

    cdef int _extend_hash_table(self, object index) except -1:
        """Add the last N entries in self.lines to the hash table.

        :param index: A sequence that declares whether each node should be
            INDEXED or not.
        """
        cdef PyObject *seq_index

        seq_index = PySequence_Fast(index, "expected a sequence for index")
        try:
            self._extend_hash_table_raw(seq_index)
        finally:
            Py_DECREF(seq_index)

    cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:
        """Find the node in the hash which defines this line.

        Each bucket in the hash table points at exactly 1 equivalent line. If 2
        objects would collide, we just increment to the next bucket until we
        get to a bucket that is either empty or exactly matches this object.

        :return: The offset in the hash table for this entry
        """
        cdef Py_ssize_t location
        cdef _raw_line *ref_line
        cdef Py_ssize_t ref_index
        cdef int compare_result

        location = line.hash & self._hashtable_bitmask
        ref_index = self._hashtable[location].line_index
        while ref_index != SENTINEL:
            ref_line = self._raw_lines + ref_index
            if (ref_line.hash == line.hash):
                PyObject_Cmp(ref_line.data, line.data, &compare_result)
                if compare_result == 0:
                    break
            location = (location + 1) & self._hashtable_bitmask
            ref_index = self._hashtable[location].line_index
        return location
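
    # A small probing example (illustrative): with a 16-bucket table the
    # bitmask is 0xF, so two distinct lines whose hashes both end in 0x3
    # compete for bucket 3.  The first line claims it; the second fails the
    # hash/compare check and walks forward until it reaches bucket 4 (the next
    # empty slot), rather than chaining inside bucket 3.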

    def _py_find_hash_position(self, line):
        """Used only for testing.

        Return the location where this fits in the hash table
        """
        cdef _raw_line raw_line

        self._line_to_raw_line(<PyObject *>line, &raw_line)
        return self._find_hash_position(&raw_line)
        
    def _inspect_left_lines(self):
        """Used only for testing.

        :return: None if _raw_lines is NULL,
            else [(object, hash_val, hash_loc, next_val)] for each node in raw
                  lines.
        """
        cdef Py_ssize_t i

        if self._raw_lines == NULL:
            return None

        result = []
        for i from 0 <= i < self._len_lines:
            PyList_Append(result,
                          (<object>self._raw_lines[i].data,
                           self._raw_lines[i].hash,
                           self._raw_lines[i].hash_offset,
                           self._raw_lines[i].next_line_index,
                           ))
        return result

    def _inspect_hash_table(self):
        """Used only for testing.

        This iterates the hash table, and returns 'interesting' entries.
        :return: (total_size, [(offset, line_index, count)]) for entries that
            are not empty.
        """
        cdef int i

        interesting = []
        for i from 0 <= i < self._hashtable_size:
            if self._hashtable[i].line_index != SENTINEL:
                PyList_Append(interesting,
                              (i, self._hashtable[i].line_index,
                               self._hashtable[i].count))
        return (self._hashtable_size, interesting)

    def get_matches(self, line):
        """Return the lines which match the line in right."""
        cdef Py_ssize_t hash_offset
        cdef _raw_line raw_line
        cdef _hash_bucket cur_bucket
        cdef Py_ssize_t cur_line_idx

        self._line_to_raw_line(<PyObject *>line, &raw_line)
        hash_offset = self._find_hash_position(&raw_line)
        cur_bucket = self._hashtable[hash_offset]
        cur_line_idx = cur_bucket.line_index
        if cur_line_idx == SENTINEL:
            return None
        result = []
        while cur_line_idx != SENTINEL:
            PyList_Append(result, cur_line_idx)
            cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
        assert len(result) == cur_bucket.count
        return result
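
    # Example (hand-traced from the logic above, values illustrative):
    #   table = EquivalenceTable(['a', 'b', 'a'])
    #   table.get_matches('a')   # -> [0, 2]
    #   table.get_matches('x')   # -> None, 'x' owns no bucket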

    def _get_matching_lines(self):
        """Return a dictionary showing matching lines."""
        matching = {}
        for line in self.lines:
            matching[line] = self.get_matches(line)
        return matching

    def get_idx_matches(self, right_idx):
        """Return the left lines matching the right line at the given offset."""
        line = self._right_lines[right_idx]
        return self.get_matches(line)

    def extend_lines(self, lines, index):
        """Add more lines to the left-lines list.

        :param lines: A list of lines to add
        :param index: A True/False for each node to define if it should be
            indexed.
        """
        cdef Py_ssize_t orig_len
        cdef Py_ssize_t min_new_hash_size
        assert len(lines) == len(index)
        min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +
                                                            len(lines))
        if self._hashtable_size >= min_new_hash_size:
            # Just add the new lines, don't bother resizing the hash table
            self.lines.extend(lines)
            self._extend_hash_table(index)
            return
        orig_len = len(self.lines)
        self.lines.extend(lines)
        self._lines_to_raw_lines(self.lines)
        for idx, val in enumerate(index):
            if not val:
                self._raw_lines[orig_len + idx].flags &= ~(<int>INDEXED)
        self._build_hash_table()
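
    # Example (illustrative): extending with a mixed index list adds every line
    # to self.lines but only hashes the ones flagged True:
    #   table.extend_lines(['c', 'd'], [True, False])
    #   table.get_matches('d')   # -> None, 'd' was stored but not INDEXED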

    def set_right_lines(self, lines):
        """Set the lines we will be matching against."""
        self._right_lines = lines


def _get_longest_match(equivalence_table, pos, max_pos, locations):
    """Get the longest possible match for the current position."""
    cdef EquivalenceTable eq
    cdef int cpos

    eq = equivalence_table
    cpos = pos
    range_start = pos
    range_len = 0
    copy_ends = None

    while pos < max_pos:
        if locations is None:
            locations = equivalence_table.get_idx_matches(pos)
        if locations is None:
            # No more matches, just return whatever we have, but we know that
            # this last position is not going to match anything
            pos = pos + 1
            break
        else:
            if copy_ends is None:
                # We are starting a new range
                copy_ends = []
                for loc in locations:
                    copy_ends.append(loc + 1)
                range_len = 1
                locations = None # Consumed
            else:
                # We are currently in the middle of a match
                next_locations = set(copy_ends).intersection(locations)
                if len(next_locations):
                    # range continues
                    copy_ends = []
                    for loc in next_locations:
                        copy_ends.append(loc + 1)
                    range_len = range_len + 1
                    locations = None # Consumed
                else:
                    # We are done with this match; the caller should start a
                    # new one. We pass back 'locations' so that it does not
                    # have to repeat the lookup for this position.
                    break
        pos = pos + 1
    if copy_ends is None:
        return None, pos, locations
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
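

# A small end-to-end sketch of _get_longest_match (hand-traced from the code
# above; values are illustrative, not taken from the test suite):
#   table = EquivalenceTable(['a', 'b', 'c'])
#   table.set_right_lines(['b', 'c'])
#   _get_longest_match(table, 0, 2, None)
#   # -> ((1, 0, 2), 2, None): the left lines starting at index 1 match the
#   #    right lines starting at index 0 for a run of 2, and scanning stopped
#   #    at pos 2 with no leftover 'locations' to reuse.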