/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
1
# Copyright (C) 2008 Canonical Limited.
2
# 
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as published
5
# by the Free Software Foundation.
6
# 
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
# GNU General Public License for more details.
11
# 
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
15
# 
16
17
"""Compiled extensions for doing compression."""
18
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
19
cdef extern from *:
20
    ctypedef unsigned long size_t
21
    void * malloc(size_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
22
    void * realloc(void *, size_t)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
23
    void free(void *)
24
25
cdef extern from "Python.h":
26
    struct _PyObject:
27
        pass
28
    ctypedef _PyObject PyObject
29
    PyObject *PySequence_Fast(object, char *) except NULL
30
    Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)
31
    PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
32
    PyObject *PyList_GET_ITEM(object, Py_ssize_t)
33
    int PyList_Append(object, object) except -1
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
34
    long PyObject_Hash(PyObject *) except -1
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
35
    # We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
36
    # if there is an exception *for* us.
37
    int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
38
    int PyObject_Not(PyObject *) except -1
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
39
    void Py_DECREF(PyObject *)
40
    void Py_INCREF(PyObject *)
41
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
42
cdef enum _raw_line_flags:
43
    INDEXED  = 0x01
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
44
45
cdef struct _raw_line:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
46
    long hash              # Cached form of the hash for this entry
47
    Py_ssize_t hash_offset # The location in the hash table for this object
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
48
    Py_ssize_t next_line_index # Next line which is equivalent to this one
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
49
    int flags              # status flags
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
50
    PyObject *data         # Raw pointer to the original line
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
51
52
53
cdef struct _hash_bucket:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
54
    Py_ssize_t line_index # First line in the left side for this bucket
55
    Py_ssize_t count      # Number of equivalent lines, DO we even need this?
56
57
58
cdef Py_ssize_t SENTINEL
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
59
SENTINEL = -1
60
61
62
cdef void *safe_malloc(size_t count) except NULL:
63
    cdef void *result
64
    result = malloc(count)
65
    if result == NULL:
66
        raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
67
    return result
68
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
69
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
71
    cdef void *result
72
    result = realloc(old, count)
73
    if result == NULL:
74
        raise MemoryError('Failed to reallocate to %d bytes of memory'
75
                          % (count,))
76
    return result
77
78
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
79
cdef int safe_free(void **val) except -1:
80
    assert val != NULL
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
81
    if val[0] != NULL:
82
        free(val[0])
83
        val[0] = NULL
84
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
85
86
cdef class EquivalenceTable:
87
    """This tracks equivalencies between lists of hashable objects.
88
89
    :ivar lines: The 'static' lines that will be preserved between runs.
90
    """
91
92
    cdef readonly object lines
93
    cdef readonly object _right_lines
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
94
    cdef Py_ssize_t _hashtable_size
95
    cdef Py_ssize_t _hashtable_bitmask
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
96
    cdef _hash_bucket *_hashtable
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
97
    cdef _raw_line *_raw_lines
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
98
    cdef Py_ssize_t _len_lines
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
99
100
    def __init__(self, lines):
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
101
        self.lines = list(lines)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
102
        self._right_lines = None
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
103
        self._hashtable_size = 0
104
        self._hashtable = NULL
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
105
        self._raw_lines = NULL
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
106
        self._len_lines = 0
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
107
        self._lines_to_raw_lines(lines)
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
108
        self._build_hash_table()
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
109
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
110
    def __dealloc__(self):
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
111
        safe_free(<void**>&self._hashtable)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
112
        safe_free(<void**>&self._raw_lines)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
113
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
114
    cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:
115
        """Convert a single PyObject into a raw line."""
116
        raw_line.hash = PyObject_Hash(line)
117
        raw_line.next_line_index = SENTINEL
118
        raw_line.hash_offset = SENTINEL
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
119
        raw_line.flags = INDEXED
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
120
        raw_line.data = line
121
        
122
    cdef int _lines_to_raw_lines(self, object lines) except -1:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
123
        """Load a sequence of objects into the _raw_line format"""
124
        cdef Py_ssize_t count, i
125
        cdef PyObject *seq
126
        cdef PyObject *item
127
        cdef _raw_line *raw
128
129
        # Do we want to use PySequence_Fast, or just assume that it is a list
130
        # Also, do we need to decref the return value?
131
        # http://www.python.org/doc/current/api/sequence.html
132
        seq = PySequence_Fast(lines, "expected a sequence")
133
        try:
134
            count = PySequence_Fast_GET_SIZE(seq)
135
            if count == 0:
136
                return 0
137
            raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
138
            safe_free(<void**>&self._raw_lines)
139
            self._raw_lines = raw
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
140
            self._len_lines = count
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
141
142
            for i from 0 <= i < count:
143
                item = PySequence_Fast_GET_ITEM(seq, i)
144
                # NB: We don't Py_INCREF the data pointer, because we know we
145
                #     maintain a pointer to the item in self.lines or
146
                #     self._right_lines
147
                # TODO: Do we even need to track a data pointer here? It would
148
                #       be less memory, and we *could* just look it up in the
149
                #       appropriate line list.
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
150
                self._line_to_raw_line(item, &raw[i])
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
151
        finally:
152
            # TODO: Unfortunately, try/finally always generates a compiler
153
            #       warning about a possibly unused variable :(
154
            Py_DECREF(seq)
155
        return count
156
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
157
    cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):
158
        """Determine the smallest hash size that can reasonably fit the data.
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
159
        
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
160
        :param needed: The number of entries we might be inserting into
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
161
            the hash (assuming there are no duplicate lines.)
162
        :return: The number of hash table entries to use. This will always be a
163
            power of two.
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
164
        """
165
        cdef Py_ssize_t hash_size
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
166
        cdef Py_ssize_t min_size
167
168
        # TODO: Another alternative would be to actually count how full the
169
        #       hash-table is, and decide if we need to grow it based on
170
        #       density. That can take into account duplicated lines. Though if
171
        #       we compress well, there should be a minimal amount of
172
        #       duplicated lines in the output.
173
174
        # At the bare minimum, we could fit all entries into a 'needed'
175
        # size hash table. However, any collision would then have a long way to
176
        # traverse before it could find a 'free' slot.
177
        # So we set the minimum size to give us 33% empty slots.
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
178
        min_size = <Py_ssize_t>(needed * 2)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
179
        hash_size = 1
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
180
        while hash_size < min_size:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
181
            hash_size = hash_size << 1
182
        return hash_size
183
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
184
    def _py_compute_minimum_hash_size(self, needed):
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
185
        """Expose _compute_minimum_hash_size to python for testing."""
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
186
        return self._compute_minimum_hash_size(needed)
187
188
    cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):
189
        """Determine a reasonable hash size, assuming some room for growth.
190
        
191
        :param needed: The number of entries we might be inserting into
192
            the hash (assuming there are no duplicate lines.)
193
        :return: The number of hash table entries to use. This will always be a
194
            power of two.
195
        """
196
        cdef Py_ssize_t hash_size
197
        cdef Py_ssize_t min_size
198
199
        # We start off with a 8k hash (after doubling), because there isn't a
200
        # lot of reason to go smaller than that (this class isn't one you'll be
201
        # instantiating thousands of, and you are always likely to grow here.)
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
202
        hash_size = 2048
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
203
        while hash_size < needed:
204
            hash_size = hash_size << 1
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
205
        # And we always give at least 4x blank space
206
        hash_size = hash_size << 2
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
207
        return hash_size
208
209
    def _py_compute_recommended_hash_size(self, needed):
210
        """Expose _compute_recommended_hash_size to python for testing."""
211
        return self._compute_recommended_hash_size(needed)
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
212
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
213
    cdef int _build_hash_table(self) except -1:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
214
        """Build the hash table 'from scratch'."""
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
215
        cdef Py_ssize_t hash_size
216
        cdef Py_ssize_t hash_bitmask
217
        cdef Py_ssize_t i
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
218
        cdef _raw_line *cur_line 
219
        cdef Py_ssize_t hash_offset
220
        cdef _hash_bucket *cur_bucket
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
221
        cdef _hash_bucket *new_hashtable
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
222
223
        # Hash size is a power of 2
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
224
        hash_size = self._compute_recommended_hash_size(self._len_lines)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
225
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
226
        new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
227
                                                   hash_size)
228
        safe_free(<void**>&self._hashtable)
229
        self._hashtable = new_hashtable
230
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
231
        self._hashtable_size = hash_size
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
232
        for i from 0 <= i < hash_size:
233
            self._hashtable[i].line_index = SENTINEL
234
            self._hashtable[i].count = 0
235
236
        # Turn the hash size into a bitmask
237
        self._hashtable_bitmask = hash_size - 1
238
239
        # Iterate backwards, because it makes it easier to insert items into
240
        # the hash (you just change the head pointer, and everything else keeps
241
        # pointing to the same location).
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
242
        for i from self._len_lines > i >= 0:
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
243
            cur_line = self._raw_lines + i
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
244
            if not (cur_line.flags & INDEXED == INDEXED):
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
245
                continue
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
246
            hash_offset = self._find_hash_position(cur_line)
247
248
            # Point this line to the location in the hash table
249
            cur_line.hash_offset = hash_offset
250
            # And make this line the head of the hash table
251
            cur_bucket = self._hashtable + hash_offset
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
252
            cur_line.next_line_index = cur_bucket.line_index
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
253
            cur_bucket.line_index = i
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
254
            cur_bucket.count = cur_bucket.count + 1
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
255
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
256
    cdef int _extend_raw_lines(self, object index) except -1:
257
        """Add the last N lines from self.lines into the raw_lines array."""
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
258
        cdef Py_ssize_t new_count
259
        cdef Py_ssize_t new_total_len
260
        cdef Py_ssize_t old_len
261
        cdef PyObject *item
262
        cdef PyObject *should_index
263
        cdef Py_ssize_t i
264
        cdef Py_ssize_t line_index
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
265
        cdef _raw_line *cur_line
266
        cdef PyObject *seq_index
267
268
        seq_index = PySequence_Fast(index, "expected a sequence for index")
269
        try:
270
            old_len = self._len_lines
271
            new_count = PySequence_Fast_GET_SIZE(seq_index) 
272
            new_total_len = new_count + self._len_lines
273
            self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,
274
                                    new_total_len * sizeof(_raw_line))
275
            self._len_lines = new_total_len
276
            # Now that we have enough space, start adding the new lines
277
            # into the array. These are done in forward order.
278
            for i from 0 <= i < new_count:
279
                line_index = i + old_len
280
                cur_line = self._raw_lines + line_index
281
                item = PyList_GET_ITEM(self.lines, line_index)
282
                self._line_to_raw_line(item, cur_line)
283
                should_index = PySequence_Fast_GET_ITEM(seq_index, i)
284
                if PyObject_Not(should_index):
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
285
                    cur_line.flags = cur_line.flags & ~(<int>INDEXED)
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
286
        finally:
287
            Py_DECREF(seq_index)
288
289
    cdef int _extend_hash_table(self, Py_ssize_t count) except -1:
290
        """Add the last N entries in self.lines to the hash table.
291
292
        :param index: A sequence that declares whether each node should be
293
            INDEXED or not.
294
        """
295
        cdef Py_ssize_t line_index
296
        cdef Py_ssize_t start
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
297
        cdef _raw_line *cur_line
298
        cdef _raw_line *next_line
299
        cdef Py_ssize_t hash_offset
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
300
        cdef _hash_bucket *cur_bucket
301
302
        start = self._len_lines - count
303
304
        for line_index from start <= line_index < self._len_lines:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
305
            cur_line = self._raw_lines + line_index
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
306
            if cur_line.flags & INDEXED != INDEXED:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
307
                continue
308
            hash_offset = self._find_hash_position(cur_line)
309
310
            # Point this line to the location in the hash table
311
            cur_line.hash_offset = hash_offset
312
313
            # Make this line the tail of the hash table
314
            cur_bucket = self._hashtable + hash_offset
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
315
            cur_bucket.count = cur_bucket.count + 1
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
316
            if cur_bucket.line_index == SENTINEL:
317
                cur_bucket.line_index = line_index
318
                continue
319
            # We need to track through the pointers and insert this at
320
            # the end
321
            next_line = self._raw_lines + cur_bucket.line_index
322
            while next_line.next_line_index != SENTINEL:
323
                next_line = self._raw_lines + next_line.next_line_index
324
            next_line.next_line_index = line_index
325
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
326
    cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
327
        """Find the node in the hash which defines this line.
328
329
        Each bucket in the hash table points at exactly 1 equivalent line. If 2
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
330
        objects would collide, we just increment to the next bucket until we
331
        get to an empty bucket that is either empty or exactly matches this
332
        object.
333
334
        :return: The offset in the hash table for this entry
335
        """
336
        cdef Py_ssize_t location
337
        cdef _raw_line *ref_line
338
        cdef Py_ssize_t ref_index
339
        cdef int compare_result
340
341
        location = line.hash & self._hashtable_bitmask
342
        ref_index = self._hashtable[location].line_index
343
        while ref_index != SENTINEL:
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
344
            ref_line = self._raw_lines + ref_index
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
345
            if (ref_line.hash == line.hash):
346
                PyObject_Cmp(ref_line.data, line.data, &compare_result)
347
                if compare_result == 0:
348
                    break
349
            location = (location + 1) & self._hashtable_bitmask
350
            ref_index = self._hashtable[location].line_index
351
        return location
352
353
    def _py_find_hash_position(self, line):
354
        """Used only for testing.
355
356
        Return the location where this fits in the hash table
357
        """
358
        cdef _raw_line raw_line
359
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
360
        self._line_to_raw_line(<PyObject *>line, &raw_line)
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
361
        return self._find_hash_position(&raw_line)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
362
        
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
363
    def _inspect_left_lines(self):
364
        """Used only for testing.
365
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
366
        :return: None if _raw_lines is NULL,
0.18.18 by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result.
367
            else [(object, hash_val, hash_loc, next_val)] for each node in raw
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
368
                  lines.
369
        """
370
        cdef Py_ssize_t i
371
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
372
        if self._raw_lines == NULL:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
373
            return None
374
375
        result = []
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
376
        for i from 0 <= i < self._len_lines:
377
            PyList_Append(result,
378
                          (<object>self._raw_lines[i].data,
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
379
                           self._raw_lines[i].hash,
380
                           self._raw_lines[i].hash_offset,
381
                           self._raw_lines[i].next_line_index,
0.18.18 by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result.
382
                           ))
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
383
        return result
384
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
385
    def _inspect_hash_table(self):
386
        """Used only for testing.
387
388
        This iterates the hash table, and returns 'interesting' entries.
389
        :return: (total_size, [(offset, line_index, count)] for entries that
390
            are not empty.
391
        """
392
        cdef int i
393
394
        interesting = []
395
        for i from 0 <= i < self._hashtable_size:
396
            if self._hashtable[i].line_index != SENTINEL:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
397
                PyList_Append(interesting,
398
                              (i, self._hashtable[i].line_index,
399
                               self._hashtable[i].count))
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
400
        return (self._hashtable_size, interesting)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
401
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
402
    def get_matches(self, line):
403
        """Return the lines which match the line in right."""
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
404
        cdef Py_ssize_t count
405
        cdef Py_ssize_t i
406
        cdef Py_ssize_t *matches
407
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
408
        matches = NULL
409
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
410
        try:
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
411
            count = self.get_raw_matches(<PyObject *>line, &matches)
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
412
            if count == 0:
413
                return None
414
            result = []
415
            for i from 0 <= i < count:
416
                PyList_Append(result, matches[i])
417
        finally:
418
            safe_free(<void**>&matches)
419
        return result
420
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
421
    cdef Py_ssize_t get_raw_matches(self, PyObject *line,
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
422
                                    Py_ssize_t **matches) except -1:
423
        """Get all the possible entries that match the given location.
424
425
        :param line: A Python line which we want to match
426
        :param pos: The index position in _right_lines
427
        :param matches: A variable to hold an array containing all the matches.
428
            This will be allocated using malloc, and the caller is responsible
429
            to free it with free(). If there are no matches, no memory will be
430
            allocated.
431
        :retrun: The number of matched positions
432
        """
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
433
        cdef Py_ssize_t hash_offset
434
        cdef _raw_line raw_line
435
        cdef _hash_bucket cur_bucket
436
        cdef Py_ssize_t cur_line_idx
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
437
        cdef Py_ssize_t count
438
        cdef Py_ssize_t *local_matches
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
439
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
440
        self._line_to_raw_line(line, &raw_line)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
441
        hash_offset = self._find_hash_position(&raw_line)
442
        cur_bucket = self._hashtable[hash_offset]
443
        cur_line_idx = cur_bucket.line_index
444
        if cur_line_idx == SENTINEL:
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
445
            return 0
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
446
        local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) *
447
                                                 (cur_bucket.count + 1))
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
448
        matches[0] = local_matches
449
        for count from 0 <= count < cur_bucket.count:
450
            assert cur_line_idx != SENTINEL
451
            local_matches[count] = cur_line_idx
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
452
            cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
453
        return cur_bucket.count
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
454
455
    def _get_matching_lines(self):
456
        """Return a dictionary showing matching lines."""
457
        matching = {}
458
        for line in self.lines:
459
            matching[line] = self.get_matches(line)
460
        return matching
461
462
    def get_idx_matches(self, right_idx):
463
        """Return the left lines matching the right line at the given offset."""
464
        line = self._right_lines[right_idx]
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
465
        return self.get_matches(line)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
466
467
    def extend_lines(self, lines, index):
468
        """Add more lines to the left-lines list.
469
470
        :param lines: A list of lines to add
471
        :param index: A True/False for each node to define if it should be
472
            indexed.
473
        """
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
474
        cdef Py_ssize_t orig_len
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
475
        cdef Py_ssize_t min_new_hash_size
476
        assert len(lines) == len(index)
477
        min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +
478
                                                            len(lines))
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
479
        orig_len = len(self.lines)
480
        self.lines.extend(lines)
481
        self._extend_raw_lines(index)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
482
        if self._hashtable_size >= min_new_hash_size:
483
            # Just add the new lines, don't bother resizing the hash table
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
484
            self._extend_hash_table(len(index))
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
485
            return
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
486
        self._build_hash_table()
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
487
488
    def set_right_lines(self, lines):
489
        """Set the lines we will be matching against."""
490
        self._right_lines = lines
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
491
492
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
493
cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count,
494
                                 Py_ssize_t *new_values, Py_ssize_t new_count) except -1:
495
    """Intersect the base values with the new values.
496
497
    It is assumed that both base and new_values are sorted in ascending order,
498
    with no duplicates.
499
500
    The values in 'base' will be modified to only contain the entries that are
501
    in both base and new. If nothing matches, 'base' will not be modified
502
503
    :param base: An array of base values
504
    :param base_count: The length of the base array
505
    :param new_values: An array of values to compare to base, will be modified
506
        to only contain matching entries
507
    :param new_count: The number of new entries
508
    :return: The count of matching values
509
    """
510
    cdef Py_ssize_t *cur_base
511
    cdef Py_ssize_t *cur_new
512
    cdef Py_ssize_t *cur_out
513
    cdef Py_ssize_t count
514
    cdef Py_ssize_t *base_end
515
    cdef Py_ssize_t *new_end
516
517
    cur_base = base
518
    cur_out = base
519
    cur_new = new_values
520
    base_end = base + base_count
521
    new_end = new_values + new_count
522
    count = 0
523
524
    while cur_base < base_end and cur_new < new_end:
525
        if cur_base[0] == cur_new[0]:
526
            # This may be assigning to self.
527
            cur_out[0] = cur_base[0]
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
528
            count = count + 1
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
529
            cur_out = cur_out + 1
530
            # We matched, so move both pointers
531
            cur_base = cur_base + 1
532
            cur_new = cur_new + 1
533
        elif cur_base[0] < cur_new[0]:
534
            # cur_base is "behind" so increment it
535
            cur_base = cur_base + 1
536
        else:
537
            # cur_new must be "behind" move its pointer
538
            cur_new = cur_new + 1
539
    return count
540
541
542
cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count):
543
    cdef int i
544
    lst = []
545
    for i from 0 <= i < count:
546
        lst.append(array[i])
547
    return lst
548
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
549
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
550
def _get_longest_match(equivalence_table, pos, max_pos, locations):
551
    """Get the longest possible match for the current position."""
552
    cdef EquivalenceTable eq
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
553
    cdef Py_ssize_t cpos
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
554
    cdef Py_ssize_t cmax_pos
555
    cdef Py_ssize_t *matches
556
    cdef Py_ssize_t match_count
557
    cdef PyObject *line
558
    cdef Py_ssize_t *copy_ends
559
    cdef Py_ssize_t copy_count
560
    cdef Py_ssize_t range_len
561
    cdef Py_ssize_t in_start
562
    cdef Py_ssize_t i
563
    cdef Py_ssize_t intersect_count
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
564
565
    eq = equivalence_table
566
    cpos = pos
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
567
    cmax_pos = max_pos
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
568
    range_start = pos
569
    range_len = 0
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
570
    matches = NULL
571
    match_count = 0
572
    copy_ends = NULL
573
    copy_count = 0
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
574
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
575
    # TODO: Handle when locations is not None
576
    while cpos < cmax_pos:
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
577
        # TODO: Instead of grabbing the full list of matches and then doing an
578
        #       intersection, use a helper that does the intersection
579
        #       as-it-goes.
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
580
        line = PyList_GET_ITEM(eq._right_lines, cpos)
581
        safe_free(<void**>&matches)
582
        match_count = eq.get_raw_matches(line, &matches)
583
        if match_count == 0:
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
584
            # No more matches, just return whatever we have, but we know that
585
            # this last position is not going to match anything
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
586
            cpos = cpos + 1
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
587
            break
588
        else:
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
589
            if copy_ends == NULL:
590
                # We are starting a new range, just steal the matches
591
                copy_ends = matches
592
                copy_count = match_count
593
                matches = NULL
594
                match_count = 0
595
                for i from 0 <= i < copy_count:
596
                    copy_ends[i] = copy_ends[i] + 1
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
597
                range_len = 1
598
            else:
599
                # We are currently in the middle of a match
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
600
                intersect_count = intersect_sorted(copy_ends, copy_count,
601
                                                   matches, match_count)
602
                if intersect_count == 0:
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
603
                    # But we are done with this match, we should be
604
                    # starting a new one, though. We will pass back 'locations'
605
                    # so that we don't have to do another lookup.
606
                    break
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
607
                else:
608
                    copy_count = intersect_count
609
                    # range continues, step the copy ends
610
                    for i from 0 <= i < copy_count:
611
                        copy_ends[i] = copy_ends[i] + 1
612
                    range_len = range_len + 1
613
                    safe_free(<void **>&matches)
614
                    match_count = 0
615
        cpos = cpos + 1
616
    if matches == NULL:
617
        # We consumed all of our matches
618
        locations = None
619
    else:
620
        locations = array_as_list(matches, match_count)
621
        safe_free(<void **>&matches)
622
    if copy_ends == NULL or copy_count == 0:
623
        # We never had a match
624
        return None, cpos, locations
625
    # Because copy_ends is always sorted, we know the 'min' is just the first
626
    # node
627
    in_start = copy_ends[0]
628
    safe_free(<void**>&copy_ends)
629
    return ((in_start - range_len), range_start, range_len), cpos, locations