/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
1
# Copyright (C) 2008 Canonical Limited.
2
# 
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as published
5
# by the Free Software Foundation.
6
# 
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
# GNU General Public License for more details.
11
# 
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
15
# 
16
17
"""Compiled extensions for doing compression."""
18
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
19
cdef extern from *:
20
    ctypedef unsigned long size_t
21
    void * malloc(size_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
22
    void * realloc(void *, size_t)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
23
    void free(void *)
24
25
cdef extern from "Python.h":
26
    struct _PyObject:
27
        pass
28
    ctypedef _PyObject PyObject
29
    PyObject *PySequence_Fast(object, char *) except NULL
30
    Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *)
31
    PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
32
    PyObject *PyList_GET_ITEM(object, Py_ssize_t)
33
    int PyList_Append(object, object) except -1
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
34
    long PyObject_Hash(PyObject *) except -1
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
35
    # We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
36
    # if there is an exception *for* us.
37
    int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
38
    int PyObject_Not(PyObject *) except -1
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
39
    void Py_DECREF(PyObject *)
40
    void Py_INCREF(PyObject *)
41
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
42
cdef enum _raw_line_flags:
43
    INDEXED  = 0x01
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
44
45
cdef struct _raw_line:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
46
    long hash              # Cached form of the hash for this entry
47
    Py_ssize_t hash_offset # The location in the hash table for this object
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
48
    Py_ssize_t next_line_index # Next line which is equivalent to this one
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
49
    int flags              # status flags
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
50
    PyObject *data         # Raw pointer to the original line
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
51
52
53
cdef struct _hash_bucket:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
54
    Py_ssize_t line_index # First line in the left side for this bucket
55
    Py_ssize_t count      # Number of equivalent lines, DO we even need this?
56
57
58
cdef Py_ssize_t SENTINEL
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
59
SENTINEL = -1
60
61
62
cdef void *safe_malloc(size_t count) except NULL:
63
    cdef void *result
64
    result = malloc(count)
65
    if result == NULL:
66
        raise MemoryError('Failed to allocate %d bytes of memory' % (count,))
67
    return result
68
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
69
70
cdef void *safe_realloc(void * old, size_t count) except NULL:
71
    cdef void *result
72
    result = realloc(old, count)
73
    if result == NULL:
74
        raise MemoryError('Failed to reallocate to %d bytes of memory'
75
                          % (count,))
76
    return result
77
78
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
79
cdef int safe_free(void **val) except -1:
80
    assert val != NULL
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
81
    if val[0] != NULL:
82
        free(val[0])
83
        val[0] = NULL
84
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
85
86
cdef class EquivalenceTable:
87
    """This tracks equivalencies between lists of hashable objects.
88
89
    :ivar lines: The 'static' lines that will be preserved between runs.
90
    """
91
92
    cdef readonly object lines
93
    cdef readonly object _right_lines
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
94
    cdef Py_ssize_t _hashtable_size
95
    cdef Py_ssize_t _hashtable_bitmask
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
96
    cdef _hash_bucket *_hashtable
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
97
    cdef _raw_line *_raw_lines
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
98
    cdef Py_ssize_t _len_lines
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
99
100
    def __init__(self, lines):
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
101
        self.lines = list(lines)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
102
        self._right_lines = None
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
103
        self._hashtable_size = 0
104
        self._hashtable = NULL
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
105
        self._raw_lines = NULL
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
106
        self._len_lines = 0
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
107
        self._lines_to_raw_lines(lines)
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
108
        self._build_hash_table()
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
109
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
110
    def __dealloc__(self):
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
111
        safe_free(<void**>&self._hashtable)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
112
        safe_free(<void**>&self._raw_lines)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
113
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
114
    cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1:
115
        """Convert a single PyObject into a raw line."""
116
        raw_line.hash = PyObject_Hash(line)
117
        raw_line.next_line_index = SENTINEL
118
        raw_line.hash_offset = SENTINEL
0.18.22 by John Arbash Meinel
Switch the indexed integer into a flags integer.
119
        raw_line.flags = INDEXED
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
120
        raw_line.data = line
121
        
122
    cdef int _lines_to_raw_lines(self, object lines) except -1:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
123
        """Load a sequence of objects into the _raw_line format"""
124
        cdef Py_ssize_t count, i
125
        cdef PyObject *seq
126
        cdef PyObject *item
127
        cdef _raw_line *raw
128
129
        # Do we want to use PySequence_Fast, or just assume that it is a list
130
        # Also, do we need to decref the return value?
131
        # http://www.python.org/doc/current/api/sequence.html
132
        seq = PySequence_Fast(lines, "expected a sequence")
133
        try:
134
            count = PySequence_Fast_GET_SIZE(seq)
135
            if count == 0:
136
                return 0
137
            raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line))
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
138
            safe_free(<void**>&self._raw_lines)
139
            self._raw_lines = raw
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
140
            self._len_lines = count
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
141
142
            for i from 0 <= i < count:
143
                item = PySequence_Fast_GET_ITEM(seq, i)
144
                # NB: We don't Py_INCREF the data pointer, because we know we
145
                #     maintain a pointer to the item in self.lines or
146
                #     self._right_lines
147
                # TODO: Do we even need to track a data pointer here? It would
148
                #       be less memory, and we *could* just look it up in the
149
                #       appropriate line list.
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
150
                self._line_to_raw_line(item, &raw[i])
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
151
        finally:
152
            # TODO: Unfortunately, try/finally always generates a compiler
153
            #       warning about a possibly unused variable :(
154
            Py_DECREF(seq)
155
        return count
156
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
157
    cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed):
158
        """Determine the smallest hash size that can reasonably fit the data.
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
159
        
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
160
        :param needed: The number of entries we might be inserting into
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
161
            the hash (assuming there are no duplicate lines.)
162
        :return: The number of hash table entries to use. This will always be a
163
            power of two.
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
164
        """
165
        cdef Py_ssize_t hash_size
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
166
        cdef Py_ssize_t min_size
167
168
        # TODO: Another alternative would be to actually count how full the
169
        #       hash-table is, and decide if we need to grow it based on
170
        #       density. That can take into account duplicated lines. Though if
171
        #       we compress well, there should be a minimal amount of
172
        #       duplicated lines in the output.
173
174
        # At the bare minimum, we could fit all entries into a 'needed'
175
        # size hash table. However, any collision would then have a long way to
176
        # traverse before it could find a 'free' slot.
177
        # So we set the minimum size to give us 33% empty slots.
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
178
        min_size = <Py_ssize_t>(needed * 2)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
179
        hash_size = 1
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
180
        while hash_size < min_size:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
181
            hash_size = hash_size << 1
182
        return hash_size
183
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
184
    def _py_compute_minimum_hash_size(self, needed):
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
185
        """Expose _compute_minimum_hash_size to python for testing."""
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
186
        return self._compute_minimum_hash_size(needed)
187
188
    cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed):
189
        """Determine a reasonable hash size, assuming some room for growth.
190
        
191
        :param needed: The number of entries we might be inserting into
192
            the hash (assuming there are no duplicate lines.)
193
        :return: The number of hash table entries to use. This will always be a
194
            power of two.
195
        """
196
        cdef Py_ssize_t hash_size
197
        cdef Py_ssize_t min_size
198
199
        # We start off with a 8k hash (after doubling), because there isn't a
200
        # lot of reason to go smaller than that (this class isn't one you'll be
201
        # instantiating thousands of, and you are always likely to grow here.)
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
202
        hash_size = 2048
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
203
        while hash_size < needed:
204
            hash_size = hash_size << 1
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
205
        # And we always give at least 4x blank space
206
        hash_size = hash_size << 2
0.18.16 by John Arbash Meinel
Test the recommended versus minimum hash table sizes.
207
        return hash_size
208
209
    def _py_compute_recommended_hash_size(self, needed):
210
        """Expose _compute_recommended_hash_size to python for testing."""
211
        return self._compute_recommended_hash_size(needed)
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
212
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
213
    cdef int _build_hash_table(self) except -1:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
214
        """Build the hash table 'from scratch'."""
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
215
        cdef Py_ssize_t hash_size
216
        cdef Py_ssize_t hash_bitmask
217
        cdef Py_ssize_t i
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
218
        cdef _raw_line *cur_line 
219
        cdef Py_ssize_t hash_offset
220
        cdef _hash_bucket *cur_bucket
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
221
        cdef _hash_bucket *new_hashtable
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
222
223
        # Hash size is a power of 2
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
224
        hash_size = self._compute_recommended_hash_size(self._len_lines)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
225
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
226
        new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) *
227
                                                   hash_size)
228
        safe_free(<void**>&self._hashtable)
229
        self._hashtable = new_hashtable
230
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
231
        self._hashtable_size = hash_size
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
232
        for i from 0 <= i < hash_size:
233
            self._hashtable[i].line_index = SENTINEL
234
            self._hashtable[i].count = 0
235
236
        # Turn the hash size into a bitmask
237
        self._hashtable_bitmask = hash_size - 1
238
239
        # Iterate backwards, because it makes it easier to insert items into
240
        # the hash (you just change the head pointer, and everything else keeps
241
        # pointing to the same location).
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
242
        for i from self._len_lines > i >= 0:
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
243
            cur_line = self._raw_lines + i
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
244
            if not (cur_line.flags & INDEXED == INDEXED):
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
245
                continue
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
246
            hash_offset = self._find_hash_position(cur_line)
247
248
            # Point this line to the location in the hash table
249
            cur_line.hash_offset = hash_offset
250
            # And make this line the head of the hash table
251
            cur_bucket = self._hashtable + hash_offset
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
252
            cur_line.next_line_index = cur_bucket.line_index
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
253
            cur_bucket.line_index = i
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
254
            cur_bucket.count = cur_bucket.count + 1
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
255
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
256
    cdef int _extend_raw_lines(self, object index) except -1:
257
        """Add the last N lines from self.lines into the raw_lines array."""
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
258
        cdef Py_ssize_t new_count
259
        cdef Py_ssize_t new_total_len
260
        cdef Py_ssize_t old_len
261
        cdef PyObject *item
262
        cdef PyObject *should_index
263
        cdef Py_ssize_t i
264
        cdef Py_ssize_t line_index
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
265
        cdef _raw_line *cur_line
266
        cdef PyObject *seq_index
0.18.37 by John Arbash Meinel
It turns out that appending a short set of lines every time was killing performance in realloc.
267
        cdef Py_ssize_t actual_new_len
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
268
269
        seq_index = PySequence_Fast(index, "expected a sequence for index")
270
        try:
271
            old_len = self._len_lines
272
            new_count = PySequence_Fast_GET_SIZE(seq_index) 
273
            new_total_len = new_count + self._len_lines
0.18.37 by John Arbash Meinel
It turns out that appending a short set of lines every time was killing performance in realloc.
274
            actual_new_len = 1
275
            while actual_new_len < new_total_len:
276
                actual_new_len = actual_new_len << 1
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
277
            self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines,
0.18.37 by John Arbash Meinel
It turns out that appending a short set of lines every time was killing performance in realloc.
278
                                    actual_new_len * sizeof(_raw_line))
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
279
            self._len_lines = new_total_len
280
            # Now that we have enough space, start adding the new lines
281
            # into the array. These are done in forward order.
282
            for i from 0 <= i < new_count:
283
                line_index = i + old_len
284
                cur_line = self._raw_lines + line_index
285
                item = PyList_GET_ITEM(self.lines, line_index)
286
                self._line_to_raw_line(item, cur_line)
287
                should_index = PySequence_Fast_GET_ITEM(seq_index, i)
288
                if PyObject_Not(should_index):
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
289
                    cur_line.flags = cur_line.flags & ~(<int>INDEXED)
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
290
        finally:
291
            Py_DECREF(seq_index)
292
293
    cdef int _extend_hash_table(self, Py_ssize_t count) except -1:
294
        """Add the last N entries in self.lines to the hash table.
295
296
        :param index: A sequence that declares whether each node should be
297
            INDEXED or not.
298
        """
299
        cdef Py_ssize_t line_index
300
        cdef Py_ssize_t start
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
301
        cdef _raw_line *cur_line
302
        cdef _raw_line *next_line
303
        cdef Py_ssize_t hash_offset
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
304
        cdef _hash_bucket *cur_bucket
305
306
        start = self._len_lines - count
307
308
        for line_index from start <= line_index < self._len_lines:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
309
            cur_line = self._raw_lines + line_index
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
310
            if cur_line.flags & INDEXED != INDEXED:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
311
                continue
312
            hash_offset = self._find_hash_position(cur_line)
313
314
            # Point this line to the location in the hash table
315
            cur_line.hash_offset = hash_offset
316
317
            # Make this line the tail of the hash table
318
            cur_bucket = self._hashtable + hash_offset
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
319
            cur_bucket.count = cur_bucket.count + 1
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
320
            if cur_bucket.line_index == SENTINEL:
321
                cur_bucket.line_index = line_index
322
                continue
323
            # We need to track through the pointers and insert this at
324
            # the end
325
            next_line = self._raw_lines + cur_bucket.line_index
326
            while next_line.next_line_index != SENTINEL:
327
                next_line = self._raw_lines + next_line.next_line_index
328
            next_line.next_line_index = line_index
329
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
330
    cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1:
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
331
        """Find the node in the hash which defines this line.
332
333
        Each bucket in the hash table points at exactly 1 equivalent line. If 2
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
334
        objects would collide, we just increment to the next bucket until we
335
        get to an empty bucket that is either empty or exactly matches this
336
        object.
337
338
        :return: The offset in the hash table for this entry
339
        """
340
        cdef Py_ssize_t location
341
        cdef _raw_line *ref_line
342
        cdef Py_ssize_t ref_index
343
        cdef int compare_result
344
345
        location = line.hash & self._hashtable_bitmask
346
        ref_index = self._hashtable[location].line_index
347
        while ref_index != SENTINEL:
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
348
            ref_line = self._raw_lines + ref_index
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
349
            if (ref_line.hash == line.hash):
350
                PyObject_Cmp(ref_line.data, line.data, &compare_result)
351
                if compare_result == 0:
352
                    break
353
            location = (location + 1) & self._hashtable_bitmask
354
            ref_index = self._hashtable[location].line_index
355
        return location
356
357
    def _py_find_hash_position(self, line):
358
        """Used only for testing.
359
360
        Return the location where this fits in the hash table
361
        """
362
        cdef _raw_line raw_line
363
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
364
        self._line_to_raw_line(<PyObject *>line, &raw_line)
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
365
        return self._find_hash_position(&raw_line)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
366
        
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
367
    def _inspect_left_lines(self):
368
        """Used only for testing.
369
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
370
        :return: None if _raw_lines is NULL,
0.18.18 by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result.
371
            else [(object, hash_val, hash_loc, next_val)] for each node in raw
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
372
                  lines.
373
        """
374
        cdef Py_ssize_t i
375
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
376
        if self._raw_lines == NULL:
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
377
            return None
378
379
        result = []
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
380
        for i from 0 <= i < self._len_lines:
381
            PyList_Append(result,
382
                          (<object>self._raw_lines[i].data,
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
383
                           self._raw_lines[i].hash,
384
                           self._raw_lines[i].hash_offset,
385
                           self._raw_lines[i].next_line_index,
0.18.18 by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result.
386
                           ))
0.18.17 by John Arbash Meinel
We now build the appropriate hash table entries.
387
        return result
388
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
389
    def _inspect_hash_table(self):
390
        """Used only for testing.
391
392
        This iterates the hash table, and returns 'interesting' entries.
393
        :return: (total_size, [(offset, line_index, count)] for entries that
394
            are not empty.
395
        """
396
        cdef int i
397
398
        interesting = []
399
        for i from 0 <= i < self._hashtable_size:
400
            if self._hashtable[i].line_index != SENTINEL:
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
401
                PyList_Append(interesting,
402
                              (i, self._hashtable[i].line_index,
403
                               self._hashtable[i].count))
0.18.15 by John Arbash Meinel
Start writing tests directly for the compiled class
404
        return (self._hashtable_size, interesting)
0.18.14 by John Arbash Meinel
A bit more work, not really usable yet.
405
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
406
    def get_matches(self, line):
407
        """Return the lines which match the line in right."""
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
408
        cdef Py_ssize_t count
409
        cdef Py_ssize_t i
410
        cdef Py_ssize_t *matches
411
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
412
        matches = NULL
413
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
414
        try:
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
415
            count = self.get_raw_matches(<PyObject *>line, &matches)
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
416
            if count == 0:
417
                return None
418
            result = []
419
            for i from 0 <= i < count:
420
                PyList_Append(result, matches[i])
421
        finally:
422
            safe_free(<void**>&matches)
423
        return result
424
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
425
    cdef Py_ssize_t get_raw_matches(self, PyObject *line,
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
426
                                    Py_ssize_t **matches) except -1:
427
        """Get all the possible entries that match the given location.
428
429
        :param line: A Python line which we want to match
430
        :param pos: The index position in _right_lines
431
        :param matches: A variable to hold an array containing all the matches.
432
            This will be allocated using malloc, and the caller is responsible
433
            to free it with free(). If there are no matches, no memory will be
434
            allocated.
435
        :retrun: The number of matched positions
436
        """
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
437
        cdef Py_ssize_t hash_offset
438
        cdef _raw_line raw_line
439
        cdef _hash_bucket cur_bucket
440
        cdef Py_ssize_t cur_line_idx
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
441
        cdef Py_ssize_t count
442
        cdef Py_ssize_t *local_matches
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
443
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
444
        self._line_to_raw_line(line, &raw_line)
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
445
        hash_offset = self._find_hash_position(&raw_line)
446
        cur_bucket = self._hashtable[hash_offset]
447
        cur_line_idx = cur_bucket.line_index
448
        if cur_line_idx == SENTINEL:
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
449
            return 0
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
450
        local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) *
451
                                                 (cur_bucket.count + 1))
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
452
        matches[0] = local_matches
453
        for count from 0 <= count < cur_bucket.count:
454
            assert cur_line_idx != SENTINEL
455
            local_matches[count] = cur_line_idx
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
456
            cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
457
        return cur_bucket.count
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
458
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
459
    cdef Py_ssize_t get_next_matches(self, PyObject *line,
460
                                     Py_ssize_t *tips,
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
461
                                     Py_ssize_t tip_count,
462
                                     int *did_match) except -1:
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
463
        """Find matches that are interesting for the next step.
464
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
465
        This finds offsets which match the current search tips. It is slightly
466
        different from get_raw_matches, in that it intersects matching lines
467
        against the existing tips, and then returns the *next* line.
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
468
469
        TODO: It might be nice to know if there were 0 matches because the line
470
              didn't match anything, or if it was just because it didn't follow
471
              anything.
472
473
        :param line: A line to match against
474
        :param tips: A list of matches that we already have. This list
475
            will be updated "in place". If there are no new matches, the list
476
            will go unmodified, and the returned count will be 0.
477
        :param tip_count: The current length of tips
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
478
        :param did_match: Will be set to True if the line had matches, else 0
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
479
        :return: The new number of matched lines.
480
        """
481
        cdef Py_ssize_t hash_offset
482
        cdef _raw_line raw_line
483
        cdef _hash_bucket cur_bucket
484
        cdef Py_ssize_t cur_line_idx
485
        cdef Py_ssize_t count
486
        cdef Py_ssize_t *cur_tip
487
        cdef Py_ssize_t *end_tip
488
        cdef Py_ssize_t *cur_out
489
490
        self._line_to_raw_line(line, &raw_line)
491
        hash_offset = self._find_hash_position(&raw_line)
492
        cur_bucket = self._hashtable[hash_offset]
493
        cur_line_idx = cur_bucket.line_index
494
        if cur_line_idx == SENTINEL:
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
495
            did_match[0] = 0
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
496
            return 0
497
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
498
        did_match[0] = 1
499
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
500
        cur_tip = tips
501
        end_tip = tips + tip_count
502
        cur_out = tips
503
        count = 0
504
        while cur_tip < end_tip and cur_line_idx != SENTINEL:
505
            if cur_line_idx == cur_tip[0]:
506
                # These match, so put a new entry in the output
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
507
                # We put the *next* line, so that this is ready to pass back in
508
                # for the next search.
509
                cur_out[0] = (cur_line_idx + 1)
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
510
                count = count + 1
511
                cur_out = cur_out + 1
512
                cur_tip = cur_tip + 1
513
                cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
514
            elif cur_line_idx < cur_tip[0]:
515
                # cur_line_idx is "behind"
516
                cur_line_idx = self._raw_lines[cur_line_idx].next_line_index
517
            else:
518
                cur_tip = cur_tip + 1
519
        return count
520
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
521
    def _get_matching_lines(self):
522
        """Return a dictionary showing matching lines."""
523
        matching = {}
524
        for line in self.lines:
525
            matching[line] = self.get_matches(line)
526
        return matching
527
528
    def get_idx_matches(self, right_idx):
529
        """Return the left lines matching the right line at the given offset."""
530
        line = self._right_lines[right_idx]
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
531
        return self.get_matches(line)
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
532
533
    def extend_lines(self, lines, index):
534
        """Add more lines to the left-lines list.
535
536
        :param lines: A list of lines to add
537
        :param index: A True/False for each node to define if it should be
538
            indexed.
539
        """
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
540
        cdef Py_ssize_t orig_len
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
541
        cdef Py_ssize_t min_new_hash_size
542
        assert len(lines) == len(index)
543
        min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) +
544
                                                            len(lines))
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
545
        orig_len = len(self.lines)
546
        self.lines.extend(lines)
547
        self._extend_raw_lines(index)
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
548
        if self._hashtable_size >= min_new_hash_size:
549
            # Just add the new lines, don't bother resizing the hash table
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
550
            self._extend_hash_table(len(index))
0.18.23 by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash
551
            return
0.18.21 by John Arbash Meinel
Add support for not including certain lines in the hashtable.
552
        self._build_hash_table()
0.18.13 by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test.
553
554
    def set_right_lines(self, lines):
555
        """Set the lines we will be matching against."""
556
        self._right_lines = lines
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
557
558
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
559
cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count,
560
                                 Py_ssize_t *new_values, Py_ssize_t new_count) except -1:
561
    """Intersect the base values with the new values.
562
563
    It is assumed that both base and new_values are sorted in ascending order,
564
    with no duplicates.
565
566
    The values in 'base' will be modified to only contain the entries that are
567
    in both base and new. If nothing matches, 'base' will not be modified
568
569
    :param base: An array of base values
570
    :param base_count: The length of the base array
571
    :param new_values: An array of values to compare to base, will be modified
572
        to only contain matching entries
573
    :param new_count: The number of new entries
574
    :return: The count of matching values
575
    """
576
    cdef Py_ssize_t *cur_base
577
    cdef Py_ssize_t *cur_new
578
    cdef Py_ssize_t *cur_out
579
    cdef Py_ssize_t count
580
    cdef Py_ssize_t *base_end
581
    cdef Py_ssize_t *new_end
582
583
    cur_base = base
584
    cur_out = base
585
    cur_new = new_values
586
    base_end = base + base_count
587
    new_end = new_values + new_count
588
    count = 0
589
590
    while cur_base < base_end and cur_new < new_end:
591
        if cur_base[0] == cur_new[0]:
592
            # This may be assigning to self.
593
            cur_out[0] = cur_base[0]
0.18.32 by John Arbash Meinel
Switch away from += for older versions of pyrex,
594
            count = count + 1
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
595
            cur_out = cur_out + 1
596
            # We matched, so move both pointers
597
            cur_base = cur_base + 1
598
            cur_new = cur_new + 1
599
        elif cur_base[0] < cur_new[0]:
600
            # cur_base is "behind" so increment it
601
            cur_base = cur_base + 1
602
        else:
603
            # cur_new must be "behind" move its pointer
604
            cur_new = cur_new + 1
605
    return count
606
607
608
cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count):
609
    cdef int i
610
    lst = []
611
    for i from 0 <= i < count:
0.18.33 by John Arbash Meinel
Use the cached match from the previous run, drops time from 2.4s => 2.0s on inventory.py
612
        PyList_Append(lst, array[i])
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
613
    return lst
614
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
615
0.18.33 by John Arbash Meinel
Use the cached match from the previous run, drops time from 2.4s => 2.0s on inventory.py
616
cdef Py_ssize_t list_as_array(object lst, Py_ssize_t **array) except -1:
617
    """Convert a Python sequence to an integer array.
618
619
    :return: The size of the output, -1 on error
620
    """
621
    cdef int i
622
    cdef PyObject *seq
623
    cdef Py_ssize_t size
624
625
    seq = PySequence_Fast(lst, "expected a sequence for lst")
626
    try:
627
        size = PySequence_Fast_GET_SIZE(seq)
628
        array[0] = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * size)
629
        for i from 0 <= i < size:
630
            array[0][i] = <object>PySequence_Fast_GET_ITEM(seq, i)
631
    finally:
632
        Py_DECREF(seq)
633
    return size
634
635
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
636
def _get_longest_match(equivalence_table, pos, max_pos, locations):
637
    """Get the longest possible match for the current position."""
638
    cdef EquivalenceTable eq
0.18.28 by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object
639
    cdef Py_ssize_t cpos
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
640
    cdef Py_ssize_t cmax_pos
641
    cdef Py_ssize_t *matches
642
    cdef Py_ssize_t match_count
643
    cdef PyObject *line
644
    cdef Py_ssize_t *copy_ends
645
    cdef Py_ssize_t copy_count
646
    cdef Py_ssize_t range_len
647
    cdef Py_ssize_t in_start
648
    cdef Py_ssize_t i
649
    cdef Py_ssize_t intersect_count
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
650
    cdef int did_match
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
651
652
    eq = equivalence_table
653
    cpos = pos
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
654
    cmax_pos = max_pos
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
655
    range_start = pos
656
    range_len = 0
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
657
    matches = NULL
658
    match_count = 0
659
    copy_ends = NULL
660
    copy_count = 0
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
661
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
662
    # TODO: Handle when locations is not None
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
663
    # if locations is not None:
664
    #     copy_count = list_as_array(locations, &copy_ends)
665
    #     # We know the match for this position
666
    #     cpos = cpos + 1
667
    #     range_len = 1
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
668
    while cpos < cmax_pos:
0.18.31 by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries.
669
        # TODO: Instead of grabbing the full list of matches and then doing an
670
        #       intersection, use a helper that does the intersection
671
        #       as-it-goes.
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
672
        line = PyList_GET_ITEM(eq._right_lines, cpos)
673
        if copy_ends == NULL:
674
            copy_count = eq.get_raw_matches(line, &copy_ends)
675
            if copy_count == 0:
676
                # No matches for this line
677
                cpos = cpos + 1
678
                break
679
            # point at the next location, for the next round trip
680
            range_len = 1
681
            for i from 0 <= i < copy_count:
682
                copy_ends[i] = copy_ends[i] + 1
0.18.26 by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function.
683
        else:
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
684
            # We are in the middle of a match
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
685
            intersect_count = eq.get_next_matches(line, copy_ends, copy_count,
686
                                                  &did_match)
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
687
            if intersect_count == 0:
0.18.36 by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise.
688
                # No more matches
689
                if not did_match:
690
                    # because the next line is not interesting
691
                    cpos = cpos + 1
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
692
                break
693
            else:
694
                copy_count = intersect_count
695
                range_len = range_len + 1
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
696
        cpos = cpos + 1
697
    if copy_ends == NULL or copy_count == 0:
698
        # We never had a match
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
699
        return None, cpos, None
0.18.29 by John Arbash Meinel
Implement _get_longest_match in Pyrex.
700
    # Because copy_ends is always sorted, we know the 'min' is just the first
701
    # node
702
    in_start = copy_ends[0]
703
    safe_free(<void**>&copy_ends)
0.18.34 by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more.
704
    return ((in_start - range_len), range_start, range_len), cpos, None