/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
1
# Copyright (C) 2008 Canonical Limited.
2
# 
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as published
5
# by the Free Software Foundation.
6
# 
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
# GNU General Public License for more details.
11
# 
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
15
# 
16
17
"""Compiled extensions for doing compression."""
18
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
19
cdef extern from *:
20
    ctypedef unsigned long size_t
21
    void * malloc(size_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
22
    void * realloc(void *, size_t)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
23
    void free(void *)
24
25
cdef extern from "Python.h":
26
    struct _PyObject:
27
        pass
28
    ctypedef _PyObject PyObject
29
    PyObject *PySequence_Fast(object, char *) except NULL
30
    Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)
31
    PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
32
    PyObject *PyList_GET_ITEM(object, Py_ssize_t)
33
    int PyList_Append(object, object) except -1
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
34
    long PyObject_Hash(PyObject *) except -1
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
35
    # We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
36
    # if there is an exception *for* us.
37
    int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
38
    int PyObject_Not(PyObject *) except -1
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
39
    void Py_DECREF(PyObject *)
40
    void Py_INCREF(PyObject *)
41
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
42
cdef enum _raw_line_flags:
43
    INDEXED  = 0x01
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
44
45
cdef struct _raw_line:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
46
    long hash              # Cached form of the hash for this entry
47
    Py_ssize_t hash_offset # The location in the hash table for this object
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
48
    Py_ssize_t next_line_index # Next line which is equivalent to this one
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
49
    int flags              # status flags
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
50
    PyObject *data         # Raw pointer to the original line
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
51
52
53
cdef struct _hash_bucket:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
54
    Py_ssize_t line_index # First line in the left side for this bucket
55
    Py_ssize_t count      # Number of equivalent lines, DO we even need this?
56
57
58
cdef Py_ssize_t SENTINEL
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
59
SENTINEL = -1
60
61
62
cdef void *safe_malloc(size_t count) except NULL:
63
    cdef void *result
64
    result = malloc(count)
65
    if result == NULL:
66
        raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
67
    return result
68
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
69
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
71
    cdef void *result
72
    result = realloc(old, count)
73
    if result == NULL:
74
        raise MemoryError('Failed to reallocate to %d bytes of memory'
75
                          % (count,))
76
    return result
77
78
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
79
cdef int safe_free(void **val) except -1:
80
    assert val != NULL
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
81
    if val[0] != NULL:
82
        free(val[0])
83
        val[0] = NULL
84
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
85
86
cdef class EquivalenceTable:
87
    """This tracks equivalencies between lists of hashable objects.
88
89
    :ivar lines: The 'static' lines that will be preserved between runs.
90
    """
91
92
    cdef readonly object lines
93
    cdef readonly object _right_lines
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
94
    cdef Py_ssize_t _hashtable_size
95
    cdef Py_ssize_t _hashtable_bitmask
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
96
    cdef _hash_bucket *_hashtable
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
97
    cdef _raw_line *_raw_lines
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
98
    cdef Py_ssize_t _len_lines
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
99
100
    def __init__(self, lines):
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
101
        self.lines = list(lines)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
102
        self._right_lines = None
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
103
        self._hashtable_size = 0
104
        self._hashtable = NULL
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
105
        self._raw_lines = NULL
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
106
        self._len_lines = 0
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
107
        self._lines_to_raw_lines(lines)
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
108
        self._build_hash_table()
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
109
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
110
    def __dealloc__(self):
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
111
        safe_free(<void**>&self._hashtable)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
112
        safe_free(<void**>&self._raw_lines)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
113
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
114
    cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:
115
        """Convert a single PyObject into a raw line."""
116
        raw_line.hash = PyObject_Hash(line)
117
        raw_line.next_line_index = SENTINEL
118
        raw_line.hash_offset = SENTINEL
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
119
        raw_line.flags = INDEXED
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
120
        raw_line.data = line
121
        
122
    cdef int _lines_to_raw_lines(self, object lines) except -1:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
123
        """Load a sequence of objects into the _raw_line format"""
124
        cdef Py_ssize_t count, i
125
        cdef PyObject *seq
126
        cdef PyObject *item
127
        cdef _raw_line *raw
128
129
        # Do we want to use PySequence_Fast, or just assume that it is a list
130
        # Also, do we need to decref the return value?
131
        # http://www.python.org/doc/current/api/sequence.html
132
        seq = PySequence_Fast(lines, "expected a sequence")
133
        try:
134
            count = PySequence_Fast_GET_SIZE(seq)
135
            if count == 0:
136
                return 0
137
            raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
138
            safe_free(<void**>&self._raw_lines)
139
            self._raw_lines = raw
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
140
            self._len_lines = count
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
141
142
            for i from 0 <= i < count:
143
                item = PySequence_Fast_GET_ITEM(seq, i)
144
                # NB: We don't Py_INCREF the data pointer, because we know we
145
                #     maintain a pointer to the item in self.lines or
146
                #     self._right_lines
147
                # TODO: Do we even need to track a data pointer here? It would
148
                #       be less memory, and we *could* just look it up in the
149
                #       appropriate line list.
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
150
                self._line_to_raw_line(item, &raw[i])
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
151
        finally:
152
            # TODO: Unfortunately, try/finally always generates a compiler
153
            #       warning about a possibly unused variable :(
154
            Py_DECREF(seq)
155
        return count
156
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
157
    cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):
158
        """Determine the smallest hash size that can reasonably fit the data.
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
159
        
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
160
        :param needed: The number of entries we might be inserting into
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
161
            the hash (assuming there are no duplicate lines.)
162
        :return: The number of hash table entries to use. This will always be a
163
            power of two.
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
164
        """
165
        cdef Py_ssize_t hash_size
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
166
        cdef Py_ssize_t min_size
167
168
        # TODO: Another alternative would be to actually count how full the
169
        #       hash-table is, and decide if we need to grow it based on
170
        #       density. That can take into account duplicated lines. Though if
171
        #       we compress well, there should be a minimal amount of
172
        #       duplicated lines in the output.
173
174
        # At the bare minimum, we could fit all entries into a 'needed'
175
        # size hash table. However, any collision would then have a long way to
176
        # traverse before it could find a 'free' slot.
177
        # So we set the minimum size to give us 33% empty slots.
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
178
        min_size = <Py_ssize_t>(needed * 1.5)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
179
        hash_size = 1
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
180
        while hash_size < min_size:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
181
            hash_size = hash_size << 1
182
        return hash_size
183
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
184
    def _py_compute_minimum_hash_size(self, needed):
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
185
        """Expose _compute_minimum_hash_size to python for testing."""
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
186
        return self._compute_minimum_hash_size(needed)
187
188
    cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):
189
        """Determine a reasonable hash size, assuming some room for growth.
190
        
191
        :param needed: The number of entries we might be inserting into
192
            the hash (assuming there are no duplicate lines.)
193
        :return: The number of hash table entries to use. This will always be a
194
            power of two.
195
        """
196
        cdef Py_ssize_t hash_size
197
        cdef Py_ssize_t min_size
198
199
        # We start off with a 8k hash (after doubling), because there isn't a
200
        # lot of reason to go smaller than that (this class isn't one you'll be
201
        # instantiating thousands of, and you are always likely to grow here.)
202
        hash_size = 4096
203
        while hash_size < needed:
204
            hash_size = hash_size << 1
205
        # And we always give at least 2x blank space
206
        hash_size = hash_size << 1
207
        return hash_size
208
209
    def _py_compute_recommended_hash_size(self, needed):
210
        """Expose _compute_recommended_hash_size to python for testing."""
211
        return self._compute_recommended_hash_size(needed)
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
212
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
213
    cdef int _build_hash_table(self) except -1:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
214
        """Build the hash table 'from scratch'."""
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
215
        cdef Py_ssize_t hash_size
216
        cdef Py_ssize_t hash_bitmask
217
        cdef Py_ssize_t i
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
218
        cdef _raw_line *cur_line 
219
        cdef Py_ssize_t hash_offset
220
        cdef _hash_bucket *cur_bucket
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
221
        cdef _hash_bucket *new_hashtable
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
222
223
        # Hash size is a power of 2
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
224
        hash_size = self._compute_recommended_hash_size(self._len_lines)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
225
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
226
        new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
227
                                                   hash_size)
228
        safe_free(<void**>&self._hashtable)
229
        self._hashtable = new_hashtable
230
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
231
        self._hashtable_size = hash_size
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
232
        for i from 0 <= i < hash_size:
233
            self._hashtable[i].line_index = SENTINEL
234
            self._hashtable[i].count = 0
235
236
        # Turn the hash size into a bitmask
237
        self._hashtable_bitmask = hash_size - 1
238
239
        # Iterate backwards, because it makes it easier to insert items into
240
        # the hash (you just change the head pointer, and everything else keeps
241
        # pointing to the same location).
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
242
        for i from self._len_lines > i >= 0:
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
243
            cur_line = self._raw_lines + i
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
244
            if not (cur_line.flags & INDEXED):
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
245
                continue
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
246
            hash_offset = self._find_hash_position(cur_line)
247
248
            # Point this line to the location in the hash table
249
            cur_line.hash_offset = hash_offset
250
            # And make this line the head of the hash table
251
            cur_bucket = self._hashtable + hash_offset
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
252
            cur_line.next_line_index = cur_bucket.line_index
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
253
            cur_bucket.line_index = i
254
            cur_bucket.count += 1
255
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
256
    cdef int _extend_hash_table_raw(self, PyObject *seq_index) except -1:
257
        cdef Py_ssize_t new_count
258
        cdef Py_ssize_t new_total_len
259
        cdef Py_ssize_t old_len
260
        cdef PyObject *item
261
        cdef PyObject *should_index
262
        cdef Py_ssize_t i
263
        cdef Py_ssize_t line_index
264
        cdef _hash_bucket *cur_bucket
265
        cdef _raw_line *cur_line
266
        cdef _raw_line *next_line
267
        cdef Py_ssize_t hash_offset
268
        cdef PyObject *local_lines
269
270
        old_len = self._len_lines
271
        new_count = PySequence_Fast_GET_SIZE(seq_index) 
272
        new_total_len = new_count + self._len_lines
273
        self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,
274
                                new_total_len * sizeof(_raw_line))
275
        self._len_lines = new_total_len
276
        # Now that we have enough space, start adding the new lines
277
        # into the array. These are done in forward order.
278
        for i from 0 <= i < new_count:
279
            line_index = i + old_len
280
            cur_line = self._raw_lines + line_index
281
            item = PyList_GET_ITEM(self.lines, line_index)
282
            self._line_to_raw_line(item, cur_line)
283
            should_index = PySequence_Fast_GET_ITEM(seq_index, i)
284
            if PyObject_Not(should_index):
285
                cur_line.flags &= ~(<int>INDEXED)
286
                continue
287
            hash_offset = self._find_hash_position(cur_line)
288
289
            # Point this line to the location in the hash table
290
            cur_line.hash_offset = hash_offset
291
292
            # Make this line the tail of the hash table
293
            cur_bucket = self._hashtable + hash_offset
294
            cur_bucket.count += 1
295
            if cur_bucket.line_index == SENTINEL:
296
                cur_bucket.line_index = line_index
297
                continue
298
            # We need to track through the pointers and insert this at
299
            # the end
300
            next_line = self._raw_lines + cur_bucket.line_index
301
            while next_line.next_line_index != SENTINEL:
302
                next_line = self._raw_lines + next_line.next_line_index
303
            next_line.next_line_index = line_index
304
305
    cdef int _extend_hash_table(self, object index) except -1:
306
        """Add the last N entries in self.lines to the hash table.
307
308
        :param index: A sequence that declares whether each node should be
309
            INDEXED or not.
310
        """
311
        cdef PyObject *seq_index
312
313
        seq_index = PySequence_Fast(index, "expected a sequence for index")
314
        try:
315
            self._extend_hash_table_raw(seq_index)
316
        finally:
317
            Py_DECREF(seq_index)
318
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
319
    cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
320
        """Find the node in the hash which defines this line.
321
322
        Each bucket in the hash table points at exactly 1 equivalent line. If 2
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
323
        objects would collide, we just increment to the next bucket until we
324
        get to an empty bucket that is either empty or exactly matches this
325
        object.
326
327
        :return: The offset in the hash table for this entry
328
        """
329
        cdef Py_ssize_t location
330
        cdef _raw_line *ref_line
331
        cdef Py_ssize_t ref_index
332
        cdef int compare_result
333
334
        location = line.hash & self._hashtable_bitmask
335
        ref_index = self._hashtable[location].line_index
336
        while ref_index != SENTINEL:
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
337
            ref_line = self._raw_lines + ref_index
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
338
            if (ref_line.hash == line.hash):
339
                PyObject_Cmp(ref_line.data, line.data, &compare_result)
340
                if compare_result == 0:
341
                    break
342
            location = (location + 1) & self._hashtable_bitmask
343
            ref_index = self._hashtable[location].line_index
344
        return location
345
346
    def _py_find_hash_position(self, line):
347
        """Used only for testing.
348
349
        Return the location where this fits in the hash table
350
        """
351
        cdef _raw_line raw_line
352
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
353
        self._line_to_raw_line(<PyObject *>line, &raw_line)
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
354
        return self._find_hash_position(&raw_line)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
355
        
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
356
    def _inspect_left_lines(self):
357
        """Used only for testing.
358
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
359
        :return: None if _raw_lines is NULL,
0.18.18 by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result.
360
            else [(object, hash_val, hash_loc, next_val)] for each node in raw
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
361
                  lines.
362
        """
363
        cdef Py_ssize_t i
364
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
365
        if self._raw_lines == NULL:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
366
            return None
367
368
        result = []
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
369
        for i from 0 <= i < self._len_lines:
370
            PyList_Append(result,
371
                          (<object>self._raw_lines[i].data,
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
372
                           self._raw_lines[i].hash,
373
                           self._raw_lines[i].hash_offset,
374
                           self._raw_lines[i].next_line_index,
0.18.18 by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result.
375
                           ))
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
376
        return result
377
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
378
    def _inspect_hash_table(self):
379
        """Used only for testing.
380
381
        This iterates the hash table, and returns 'interesting' entries.
382
        :return: (total_size, [(offset, line_index, count)] for entries that
383
            are not empty.
384
        """
385
        cdef int i
386
387
        interesting = []
388
        for i from 0 <= i < self._hashtable_size:
389
            if self._hashtable[i].line_index != SENTINEL:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
390
                PyList_Append(interesting,
391
                              (i, self._hashtable[i].line_index,
392
                               self._hashtable[i].count))
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
393
        return (self._hashtable_size, interesting)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
394
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
395
    def get_matches(self, line):
396
        """Return the lines which match the line in right."""
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
397
        cdef Py_ssize_t hash_offset
398
        cdef _raw_line raw_line
399
        cdef _hash_bucket cur_bucket
400
        cdef Py_ssize_t cur_line_idx
401
402
        self._line_to_raw_line(<PyObject *>line, &raw_line)
403
        hash_offset = self._find_hash_position(&raw_line)
404
        cur_bucket = self._hashtable[hash_offset]
405
        cur_line_idx = cur_bucket.line_index
406
        if cur_line_idx == SENTINEL:
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
407
            return None
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
408
        result = []
409
        while cur_line_idx != SENTINEL:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
410
            PyList_Append(result, cur_line_idx)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
411
            cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
412
        assert len(result) == cur_bucket.count
413
        return result
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
414
415
    def _get_matching_lines(self):
416
        """Return a dictionary showing matching lines."""
417
        matching = {}
418
        for line in self.lines:
419
            matching[line] = self.get_matches(line)
420
        return matching
421
422
    def get_idx_matches(self, right_idx):
423
        """Return the left lines matching the right line at the given offset."""
424
        line = self._right_lines[right_idx]
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
425
        return self.get_matches(line)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
426
427
    def extend_lines(self, lines, index):
428
        """Add more lines to the left-lines list.
429
430
        :param lines: A list of lines to add
431
        :param index: A True/False for each node to define if it should be
432
            indexed.
433
        """
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
434
        cdef Py_ssize_t orig_len
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
435
        cdef Py_ssize_t min_new_hash_size
436
        assert len(lines) == len(index)
437
        min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +
438
                                                            len(lines))
439
        if self._hashtable_size >= min_new_hash_size:
440
            # Just add the new lines, don't bother resizing the hash table
441
            self.lines.extend(lines)
442
            self._extend_hash_table(index)
443
            return
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
444
        orig_len = len(self.lines)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
445
        self.lines.extend(lines)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
446
        self._lines_to_raw_lines(self.lines)
447
        for idx, val in enumerate(index):
448
            if not val:
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
449
                self._raw_lines[orig_len + idx].flags &= ~(<int>INDEXED)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
450
        self._build_hash_table()
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
451
452
    def set_right_lines(self, lines):
453
        """Set the lines we will be matching against."""
454
        self._right_lines = lines
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
455
456
457
def _get_longest_match(equivalence_table, pos, max_pos, locations):
458
    """Get the longest possible match for the current position."""
459
    cdef EquivalenceTable eq
460
    cdef int cpos
461
462
    eq = equivalence_table
463
    cpos = pos
464
    range_start = pos
465
    range_len = 0
466
    copy_ends = None
467
468
    while pos < max_pos:
469
        if locations is None:
470
            locations = equivalence_table.get_idx_matches(pos)
471
        if locations is None:
472
            # No more matches, just return whatever we have, but we know that
473
            # this last position is not going to match anything
474
            pos = pos + 1
475
            break
476
        else:
477
            if copy_ends is None:
478
                # We are starting a new range
479
                copy_ends = []
480
                for loc in locations:
481
                    copy_ends.append(loc + 1)
482
                range_len = 1
483
                locations = None # Consumed
484
            else:
485
                # We are currently in the middle of a match
486
                next_locations = set(copy_ends).intersection(locations)
487
                if len(next_locations):
488
                    # range continues
489
                    copy_ends = []
490
                    for loc in next_locations:
491
                        copy_ends.append(loc + 1)
492
                    range_len = range_len + 1
493
                    locations = None # Consumed
494
                else:
495
                    # But we are done with this match, we should be
496
                    # starting a new one, though. We will pass back 'locations'
497
                    # so that we don't have to do another lookup.
498
                    break
499
        pos = pos + 1
500
    if copy_ends is None:
501
        return None, pos, locations
502
    return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations
503
504