bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
1 |
# Copyright (C) 2008 Canonical Limited.
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License version 2 as published
|
|
5 |
# by the Free Software Foundation.
|
|
6 |
#
|
|
7 |
# This program is distributed in the hope that it will be useful,
|
|
8 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10 |
# GNU General Public License for more details.
|
|
11 |
#
|
|
12 |
# You should have received a copy of the GNU General Public License
|
|
13 |
# along with this program; if not, write to the Free Software
|
|
14 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
15 |
#
|
|
16 |
||
17 |
"""Compiled extensions for doing compression."""
|
|
18 |
||
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
19 |
cdef extern from *: |
20 |
ctypedef unsigned long size_t |
|
21 |
void * malloc(size_t) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
22 |
void * realloc(void *, size_t) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
23 |
void free(void *) |
24 |
||
25 |
cdef extern from "Python.h": |
|
26 |
struct _PyObject: |
|
27 |
pass
|
|
28 |
ctypedef _PyObject PyObject |
|
29 |
PyObject *PySequence_Fast(object, char *) except NULL |
|
30 |
Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *) |
|
31 |
PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
32 |
PyObject *PyList_GET_ITEM(object, Py_ssize_t) |
33 |
int PyList_Append(object, object) except -1 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
34 |
long PyObject_Hash(PyObject *) except -1 |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
35 |
# We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
|
36 |
# if there is an exception *for* us.
|
|
37 |
int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1 |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
38 |
int PyObject_Not(PyObject *) except -1 |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
39 |
void Py_DECREF(PyObject *) |
40 |
void Py_INCREF(PyObject *) |
|
41 |
||
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
42 |
cdef enum _raw_line_flags: |
43 |
INDEXED = 0x01 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
44 |
|
45 |
cdef struct _raw_line: |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
46 |
long hash # Cached form of the hash for this entry |
47 |
Py_ssize_t hash_offset # The location in the hash table for this object |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
48 |
Py_ssize_t next_line_index # Next line which is equivalent to this one |
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
49 |
int flags # status flags |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
50 |
PyObject *data # Raw pointer to the original line |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
51 |
|
52 |
||
53 |
cdef struct _hash_bucket: |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
54 |
Py_ssize_t line_index # First line in the left side for this bucket |
55 |
Py_ssize_t count # Number of equivalent lines, DO we even need this? |
|
56 |
||
57 |
||
58 |
cdef Py_ssize_t SENTINEL |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
59 |
SENTINEL = -1 |
60 |
||
61 |
||
62 |
cdef void *safe_malloc(size_t count) except NULL: |
|
63 |
cdef void *result |
|
64 |
result = malloc(count) |
|
65 |
if result == NULL: |
|
66 |
raise MemoryError('Failed to allocate %d bytes of memory' % (count,)) |
|
67 |
return result |
|
68 |
||
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
69 |
|
70 |
cdef void *safe_realloc(void * old, size_t count) except NULL: |
|
71 |
cdef void *result |
|
72 |
result = realloc(old, count) |
|
73 |
if result == NULL: |
|
74 |
raise MemoryError('Failed to reallocate to %d bytes of memory' |
|
75 |
% (count,)) |
|
76 |
return result |
|
77 |
||
78 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
79 |
cdef int safe_free(void **val) except -1: |
80 |
assert val != NULL |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
81 |
if val[0] != NULL: |
82 |
free(val[0]) |
|
83 |
val[0] = NULL |
|
84 |
||
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
85 |
|
86 |
cdef class EquivalenceTable: |
|
87 |
"""This tracks equivalencies between lists of hashable objects. |
|
88 |
||
89 |
:ivar lines: The 'static' lines that will be preserved between runs.
|
|
90 |
"""
|
|
91 |
||
92 |
cdef readonly object lines |
|
93 |
cdef readonly object _right_lines |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
94 |
cdef Py_ssize_t _hashtable_size |
95 |
cdef Py_ssize_t _hashtable_bitmask |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
96 |
cdef _hash_bucket *_hashtable |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
97 |
cdef _raw_line *_raw_lines |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
98 |
cdef Py_ssize_t _len_lines |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
99 |
|
100 |
def __init__(self, lines): |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
101 |
self.lines = list(lines) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
102 |
self._right_lines = None |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
103 |
self._hashtable_size = 0 |
104 |
self._hashtable = NULL |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
105 |
self._raw_lines = NULL |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
106 |
self._len_lines = 0 |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
107 |
self._lines_to_raw_lines(lines) |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
108 |
self._build_hash_table() |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
109 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
110 |
def __dealloc__(self): |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
111 |
safe_free(<void**>&self._hashtable) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
112 |
safe_free(<void**>&self._raw_lines) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
113 |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
114 |
cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1: |
115 |
"""Convert a single PyObject into a raw line.""" |
|
116 |
raw_line.hash = PyObject_Hash(line) |
|
117 |
raw_line.next_line_index = SENTINEL |
|
118 |
raw_line.hash_offset = SENTINEL |
|
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
119 |
raw_line.flags = INDEXED |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
120 |
raw_line.data = line |
121 |
||
122 |
cdef int _lines_to_raw_lines(self, object lines) except -1: |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
123 |
"""Load a sequence of objects into the _raw_line format""" |
124 |
cdef Py_ssize_t count, i |
|
125 |
cdef PyObject *seq |
|
126 |
cdef PyObject *item |
|
127 |
cdef _raw_line *raw |
|
128 |
||
129 |
# Do we want to use PySequence_Fast, or just assume that it is a list
|
|
130 |
# Also, do we need to decref the return value?
|
|
131 |
# http://www.python.org/doc/current/api/sequence.html
|
|
132 |
seq = PySequence_Fast(lines, "expected a sequence") |
|
133 |
try: |
|
134 |
count = PySequence_Fast_GET_SIZE(seq) |
|
135 |
if count == 0: |
|
136 |
return 0 |
|
137 |
raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line)) |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
138 |
safe_free(<void**>&self._raw_lines) |
139 |
self._raw_lines = raw |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
140 |
self._len_lines = count |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
141 |
|
142 |
for i from 0 <= i < count: |
|
143 |
item = PySequence_Fast_GET_ITEM(seq, i) |
|
144 |
# NB: We don't Py_INCREF the data pointer, because we know we
|
|
145 |
# maintain a pointer to the item in self.lines or
|
|
146 |
# self._right_lines
|
|
147 |
# TODO: Do we even need to track a data pointer here? It would
|
|
148 |
# be less memory, and we *could* just look it up in the
|
|
149 |
# appropriate line list.
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
150 |
self._line_to_raw_line(item, &raw[i]) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
151 |
finally: |
152 |
# TODO: Unfortunately, try/finally always generates a compiler
|
|
153 |
# warning about a possibly unused variable :(
|
|
154 |
Py_DECREF(seq) |
|
155 |
return count |
|
156 |
||
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
157 |
cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed): |
158 |
"""Determine the smallest hash size that can reasonably fit the data. |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
159 |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
160 |
:param needed: The number of entries we might be inserting into
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
161 |
the hash (assuming there are no duplicate lines.)
|
162 |
:return: The number of hash table entries to use. This will always be a
|
|
163 |
power of two.
|
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
164 |
"""
|
165 |
cdef Py_ssize_t hash_size |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
166 |
cdef Py_ssize_t min_size |
167 |
||
168 |
# TODO: Another alternative would be to actually count how full the
|
|
169 |
# hash-table is, and decide if we need to grow it based on
|
|
170 |
# density. That can take into account duplicated lines. Though if
|
|
171 |
# we compress well, there should be a minimal amount of
|
|
172 |
# duplicated lines in the output.
|
|
173 |
||
174 |
# At the bare minimum, we could fit all entries into a 'needed'
|
|
175 |
# size hash table. However, any collision would then have a long way to
|
|
176 |
# traverse before it could find a 'free' slot.
|
|
177 |
# So we set the minimum size to give us 33% empty slots.
|
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
178 |
min_size = <Py_ssize_t>(needed * 2) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
179 |
hash_size = 1 |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
180 |
while hash_size < min_size: |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
181 |
hash_size = hash_size << 1 |
182 |
return hash_size |
|
183 |
||
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
184 |
def _py_compute_minimum_hash_size(self, needed): |
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
185 |
"""Expose _compute_minimum_hash_size to python for testing.""" |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
186 |
return self._compute_minimum_hash_size(needed) |
187 |
||
188 |
cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed): |
|
189 |
"""Determine a reasonable hash size, assuming some room for growth. |
|
190 |
|
|
191 |
:param needed: The number of entries we might be inserting into
|
|
192 |
the hash (assuming there are no duplicate lines.)
|
|
193 |
:return: The number of hash table entries to use. This will always be a
|
|
194 |
power of two.
|
|
195 |
"""
|
|
196 |
cdef Py_ssize_t hash_size |
|
197 |
cdef Py_ssize_t min_size |
|
198 |
||
199 |
# We start off with a 8k hash (after doubling), because there isn't a
|
|
200 |
# lot of reason to go smaller than that (this class isn't one you'll be
|
|
201 |
# instantiating thousands of, and you are always likely to grow here.)
|
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
202 |
hash_size = 2048 |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
203 |
while hash_size < needed: |
204 |
hash_size = hash_size << 1 |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
205 |
# And we always give at least 4x blank space
|
206 |
hash_size = hash_size << 2 |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
207 |
return hash_size |
208 |
||
209 |
def _py_compute_recommended_hash_size(self, needed): |
|
210 |
"""Expose _compute_recommended_hash_size to python for testing.""" |
|
211 |
return self._compute_recommended_hash_size(needed) |
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
212 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
213 |
cdef int _build_hash_table(self) except -1: |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
214 |
"""Build the hash table 'from scratch'.""" |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
215 |
cdef Py_ssize_t hash_size |
216 |
cdef Py_ssize_t hash_bitmask |
|
217 |
cdef Py_ssize_t i |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
218 |
cdef _raw_line *cur_line |
219 |
cdef Py_ssize_t hash_offset |
|
220 |
cdef _hash_bucket *cur_bucket |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
221 |
cdef _hash_bucket *new_hashtable |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
222 |
|
223 |
# Hash size is a power of 2
|
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
224 |
hash_size = self._compute_recommended_hash_size(self._len_lines) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
225 |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
226 |
new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) * |
227 |
hash_size) |
|
228 |
safe_free(<void**>&self._hashtable) |
|
229 |
self._hashtable = new_hashtable |
|
230 |
||
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
231 |
self._hashtable_size = hash_size |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
232 |
for i from 0 <= i < hash_size: |
233 |
self._hashtable[i].line_index = SENTINEL |
|
234 |
self._hashtable[i].count = 0 |
|
235 |
||
236 |
# Turn the hash size into a bitmask
|
|
237 |
self._hashtable_bitmask = hash_size - 1 |
|
238 |
||
239 |
# Iterate backwards, because it makes it easier to insert items into
|
|
240 |
# the hash (you just change the head pointer, and everything else keeps
|
|
241 |
# pointing to the same location).
|
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
242 |
for i from self._len_lines > i >= 0: |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
243 |
cur_line = self._raw_lines + i |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
244 |
if not (cur_line.flags & INDEXED == INDEXED): |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
245 |
continue
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
246 |
hash_offset = self._find_hash_position(cur_line) |
247 |
||
248 |
# Point this line to the location in the hash table
|
|
249 |
cur_line.hash_offset = hash_offset |
|
250 |
# And make this line the head of the hash table
|
|
251 |
cur_bucket = self._hashtable + hash_offset |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
252 |
cur_line.next_line_index = cur_bucket.line_index |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
253 |
cur_bucket.line_index = i |
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
254 |
cur_bucket.count = cur_bucket.count + 1 |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
255 |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
256 |
cdef int _extend_raw_lines(self, object index) except -1: |
257 |
"""Add the last N lines from self.lines into the raw_lines array.""" |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
258 |
cdef Py_ssize_t new_count |
259 |
cdef Py_ssize_t new_total_len |
|
260 |
cdef Py_ssize_t old_len |
|
261 |
cdef PyObject *item |
|
262 |
cdef PyObject *should_index |
|
263 |
cdef Py_ssize_t i |
|
264 |
cdef Py_ssize_t line_index |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
265 |
cdef _raw_line *cur_line |
266 |
cdef PyObject *seq_index |
|
|
0.18.37
by John Arbash Meinel
It turns out that appending a short set of lines every time was killing performance in realloc. |
267 |
cdef Py_ssize_t actual_new_len |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
268 |
|
269 |
seq_index = PySequence_Fast(index, "expected a sequence for index") |
|
270 |
try: |
|
271 |
old_len = self._len_lines |
|
272 |
new_count = PySequence_Fast_GET_SIZE(seq_index) |
|
273 |
new_total_len = new_count + self._len_lines |
|
|
0.18.37
by John Arbash Meinel
It turns out that appending a short set of lines every time was killing performance in realloc. |
274 |
actual_new_len = 1 |
275 |
while actual_new_len < new_total_len: |
|
276 |
actual_new_len = actual_new_len << 1 |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
277 |
self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines, |
|
0.18.37
by John Arbash Meinel
It turns out that appending a short set of lines every time was killing performance in realloc. |
278 |
actual_new_len * sizeof(_raw_line)) |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
279 |
self._len_lines = new_total_len |
280 |
# Now that we have enough space, start adding the new lines
|
|
281 |
# into the array. These are done in forward order.
|
|
282 |
for i from 0 <= i < new_count: |
|
283 |
line_index = i + old_len |
|
284 |
cur_line = self._raw_lines + line_index |
|
285 |
item = PyList_GET_ITEM(self.lines, line_index) |
|
286 |
self._line_to_raw_line(item, cur_line) |
|
287 |
should_index = PySequence_Fast_GET_ITEM(seq_index, i) |
|
288 |
if PyObject_Not(should_index): |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
289 |
cur_line.flags = cur_line.flags & ~(<int>INDEXED) |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
290 |
finally: |
291 |
Py_DECREF(seq_index) |
|
292 |
||
293 |
cdef int _extend_hash_table(self, Py_ssize_t count) except -1: |
|
294 |
"""Add the last N entries in self.lines to the hash table. |
|
295 |
||
296 |
:param index: A sequence that declares whether each node should be
|
|
297 |
INDEXED or not.
|
|
298 |
"""
|
|
299 |
cdef Py_ssize_t line_index |
|
300 |
cdef Py_ssize_t start |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
301 |
cdef _raw_line *cur_line |
302 |
cdef _raw_line *next_line |
|
303 |
cdef Py_ssize_t hash_offset |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
304 |
cdef _hash_bucket *cur_bucket |
305 |
||
306 |
start = self._len_lines - count |
|
307 |
||
308 |
for line_index from start <= line_index < self._len_lines: |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
309 |
cur_line = self._raw_lines + line_index |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
310 |
if cur_line.flags & INDEXED != INDEXED: |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
311 |
continue
|
312 |
hash_offset = self._find_hash_position(cur_line) |
|
313 |
||
314 |
# Point this line to the location in the hash table
|
|
315 |
cur_line.hash_offset = hash_offset |
|
316 |
||
317 |
# Make this line the tail of the hash table
|
|
318 |
cur_bucket = self._hashtable + hash_offset |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
319 |
cur_bucket.count = cur_bucket.count + 1 |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
320 |
if cur_bucket.line_index == SENTINEL: |
321 |
cur_bucket.line_index = line_index |
|
322 |
continue
|
|
323 |
# We need to track through the pointers and insert this at
|
|
324 |
# the end
|
|
325 |
next_line = self._raw_lines + cur_bucket.line_index |
|
326 |
while next_line.next_line_index != SENTINEL: |
|
327 |
next_line = self._raw_lines + next_line.next_line_index |
|
328 |
next_line.next_line_index = line_index |
|
329 |
||
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
330 |
cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1: |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
331 |
"""Find the node in the hash which defines this line. |
332 |
||
333 |
Each bucket in the hash table points at exactly 1 equivalent line. If 2
|
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
334 |
objects would collide, we just increment to the next bucket until we
|
335 |
get to an empty bucket that is either empty or exactly matches this
|
|
336 |
object.
|
|
337 |
||
338 |
:return: The offset in the hash table for this entry
|
|
339 |
"""
|
|
340 |
cdef Py_ssize_t location |
|
341 |
cdef _raw_line *ref_line |
|
342 |
cdef Py_ssize_t ref_index |
|
343 |
cdef int compare_result |
|
344 |
||
345 |
location = line.hash & self._hashtable_bitmask |
|
346 |
ref_index = self._hashtable[location].line_index |
|
347 |
while ref_index != SENTINEL: |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
348 |
ref_line = self._raw_lines + ref_index |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
349 |
if (ref_line.hash == line.hash): |
350 |
PyObject_Cmp(ref_line.data, line.data, &compare_result) |
|
351 |
if compare_result == 0: |
|
352 |
break
|
|
353 |
location = (location + 1) & self._hashtable_bitmask |
|
354 |
ref_index = self._hashtable[location].line_index |
|
355 |
return location |
|
356 |
||
357 |
def _py_find_hash_position(self, line): |
|
358 |
"""Used only for testing. |
|
359 |
||
360 |
Return the location where this fits in the hash table
|
|
361 |
"""
|
|
362 |
cdef _raw_line raw_line |
|
363 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
364 |
self._line_to_raw_line(<PyObject *>line, &raw_line) |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
365 |
return self._find_hash_position(&raw_line) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
366 |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
367 |
def _inspect_left_lines(self): |
368 |
"""Used only for testing. |
|
369 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
370 |
:return: None if _raw_lines is NULL,
|
|
0.18.18
by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result. |
371 |
else [(object, hash_val, hash_loc, next_val)] for each node in raw
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
372 |
lines.
|
373 |
"""
|
|
374 |
cdef Py_ssize_t i |
|
375 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
376 |
if self._raw_lines == NULL: |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
377 |
return None |
378 |
||
379 |
result = [] |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
380 |
for i from 0 <= i < self._len_lines: |
381 |
PyList_Append(result, |
|
382 |
(<object>self._raw_lines[i].data, |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
383 |
self._raw_lines[i].hash, |
384 |
self._raw_lines[i].hash_offset, |
|
385 |
self._raw_lines[i].next_line_index, |
|
|
0.18.18
by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result. |
386 |
))
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
387 |
return result |
388 |
||
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
389 |
def _inspect_hash_table(self): |
390 |
"""Used only for testing. |
|
391 |
||
392 |
This iterates the hash table, and returns 'interesting' entries.
|
|
393 |
:return: (total_size, [(offset, line_index, count)] for entries that
|
|
394 |
are not empty.
|
|
395 |
"""
|
|
396 |
cdef int i |
|
397 |
||
398 |
interesting = [] |
|
399 |
for i from 0 <= i < self._hashtable_size: |
|
400 |
if self._hashtable[i].line_index != SENTINEL: |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
401 |
PyList_Append(interesting, |
402 |
(i, self._hashtable[i].line_index, |
|
403 |
self._hashtable[i].count)) |
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
404 |
return (self._hashtable_size, interesting) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
405 |
|
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
406 |
def get_matches(self, line): |
407 |
"""Return the lines which match the line in right.""" |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
408 |
cdef Py_ssize_t count |
409 |
cdef Py_ssize_t i |
|
410 |
cdef Py_ssize_t *matches |
|
411 |
||
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
412 |
matches = NULL |
413 |
||
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
414 |
try: |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
415 |
count = self.get_raw_matches(<PyObject *>line, &matches) |
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
416 |
if count == 0: |
417 |
return None |
|
418 |
result = [] |
|
419 |
for i from 0 <= i < count: |
|
420 |
PyList_Append(result, matches[i]) |
|
421 |
finally: |
|
422 |
safe_free(<void**>&matches) |
|
423 |
return result |
|
424 |
||
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
425 |
cdef Py_ssize_t get_raw_matches(self, PyObject *line, |
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
426 |
Py_ssize_t **matches) except -1: |
427 |
"""Get all the possible entries that match the given location. |
|
428 |
||
429 |
:param line: A Python line which we want to match
|
|
430 |
:param pos: The index position in _right_lines
|
|
431 |
:param matches: A variable to hold an array containing all the matches.
|
|
432 |
This will be allocated using malloc, and the caller is responsible
|
|
433 |
to free it with free(). If there are no matches, no memory will be
|
|
434 |
allocated.
|
|
435 |
:retrun: The number of matched positions
|
|
436 |
"""
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
437 |
cdef Py_ssize_t hash_offset |
438 |
cdef _raw_line raw_line |
|
439 |
cdef _hash_bucket cur_bucket |
|
440 |
cdef Py_ssize_t cur_line_idx |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
441 |
cdef Py_ssize_t count |
442 |
cdef Py_ssize_t *local_matches |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
443 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
444 |
self._line_to_raw_line(line, &raw_line) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
445 |
hash_offset = self._find_hash_position(&raw_line) |
446 |
cur_bucket = self._hashtable[hash_offset] |
|
447 |
cur_line_idx = cur_bucket.line_index |
|
448 |
if cur_line_idx == SENTINEL: |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
449 |
return 0 |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
450 |
local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * |
451 |
(cur_bucket.count + 1)) |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
452 |
matches[0] = local_matches |
453 |
for count from 0 <= count < cur_bucket.count: |
|
454 |
assert cur_line_idx != SENTINEL |
|
455 |
local_matches[count] = cur_line_idx |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
456 |
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index |
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
457 |
return cur_bucket.count |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
458 |
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
459 |
cdef Py_ssize_t get_next_matches(self, PyObject *line, |
460 |
Py_ssize_t *tips, |
|
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
461 |
Py_ssize_t tip_count, |
462 |
int *did_match) except -1: |
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
463 |
"""Find matches that are interesting for the next step. |
464 |
||
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
465 |
This finds offsets which match the current search tips. It is slightly
|
466 |
different from get_raw_matches, in that it intersects matching lines
|
|
467 |
against the existing tips, and then returns the *next* line.
|
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
468 |
|
469 |
TODO: It might be nice to know if there were 0 matches because the line
|
|
470 |
didn't match anything, or if it was just because it didn't follow
|
|
471 |
anything.
|
|
472 |
||
473 |
:param line: A line to match against
|
|
474 |
:param tips: A list of matches that we already have. This list
|
|
475 |
will be updated "in place". If there are no new matches, the list
|
|
476 |
will go unmodified, and the returned count will be 0.
|
|
477 |
:param tip_count: The current length of tips
|
|
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
478 |
:param did_match: Will be set to True if the line had matches, else 0
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
479 |
:return: The new number of matched lines.
|
480 |
"""
|
|
481 |
cdef Py_ssize_t hash_offset |
|
482 |
cdef _raw_line raw_line |
|
483 |
cdef _hash_bucket cur_bucket |
|
484 |
cdef Py_ssize_t cur_line_idx |
|
485 |
cdef Py_ssize_t count |
|
486 |
cdef Py_ssize_t *cur_tip |
|
487 |
cdef Py_ssize_t *end_tip |
|
488 |
cdef Py_ssize_t *cur_out |
|
489 |
||
490 |
self._line_to_raw_line(line, &raw_line) |
|
491 |
hash_offset = self._find_hash_position(&raw_line) |
|
492 |
cur_bucket = self._hashtable[hash_offset] |
|
493 |
cur_line_idx = cur_bucket.line_index |
|
494 |
if cur_line_idx == SENTINEL: |
|
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
495 |
did_match[0] = 0 |
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
496 |
return 0 |
497 |
||
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
498 |
did_match[0] = 1 |
499 |
||
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
500 |
cur_tip = tips |
501 |
end_tip = tips + tip_count |
|
502 |
cur_out = tips |
|
503 |
count = 0 |
|
504 |
while cur_tip < end_tip and cur_line_idx != SENTINEL: |
|
505 |
if cur_line_idx == cur_tip[0]: |
|
506 |
# These match, so put a new entry in the output
|
|
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
507 |
# We put the *next* line, so that this is ready to pass back in
|
508 |
# for the next search.
|
|
509 |
cur_out[0] = (cur_line_idx + 1) |
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
510 |
count = count + 1 |
511 |
cur_out = cur_out + 1 |
|
512 |
cur_tip = cur_tip + 1 |
|
513 |
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index |
|
514 |
elif cur_line_idx < cur_tip[0]: |
|
515 |
# cur_line_idx is "behind"
|
|
516 |
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index |
|
517 |
else: |
|
518 |
cur_tip = cur_tip + 1 |
|
519 |
return count |
|
520 |
||
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
521 |
def _get_matching_lines(self): |
522 |
"""Return a dictionary showing matching lines.""" |
|
523 |
matching = {} |
|
524 |
for line in self.lines: |
|
525 |
matching[line] = self.get_matches(line) |
|
526 |
return matching |
|
527 |
||
528 |
def get_idx_matches(self, right_idx): |
|
529 |
"""Return the left lines matching the right line at the given offset.""" |
|
530 |
line = self._right_lines[right_idx] |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
531 |
return self.get_matches(line) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
532 |
|
533 |
def extend_lines(self, lines, index): |
|
534 |
"""Add more lines to the left-lines list. |
|
535 |
||
536 |
:param lines: A list of lines to add
|
|
537 |
:param index: A True/False for each node to define if it should be
|
|
538 |
indexed.
|
|
539 |
"""
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
540 |
cdef Py_ssize_t orig_len |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
541 |
cdef Py_ssize_t min_new_hash_size |
542 |
assert len(lines) == len(index) |
|
543 |
min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) + |
|
544 |
len(lines)) |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
545 |
orig_len = len(self.lines) |
546 |
self.lines.extend(lines) |
|
547 |
self._extend_raw_lines(index) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
548 |
if self._hashtable_size >= min_new_hash_size: |
549 |
# Just add the new lines, don't bother resizing the hash table
|
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
550 |
self._extend_hash_table(len(index)) |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
551 |
return
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
552 |
self._build_hash_table() |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
553 |
|
554 |
def set_right_lines(self, lines): |
|
555 |
"""Set the lines we will be matching against.""" |
|
556 |
self._right_lines = lines |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
557 |
|
558 |
||
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
559 |
cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count, |
560 |
Py_ssize_t *new_values, Py_ssize_t new_count) except -1: |
|
561 |
"""Intersect the base values with the new values. |
|
562 |
||
563 |
It is assumed that both base and new_values are sorted in ascending order,
|
|
564 |
with no duplicates.
|
|
565 |
||
566 |
The values in 'base' will be modified to only contain the entries that are
|
|
567 |
in both base and new. If nothing matches, 'base' will not be modified
|
|
568 |
||
569 |
:param base: An array of base values
|
|
570 |
:param base_count: The length of the base array
|
|
571 |
:param new_values: An array of values to compare to base, will be modified
|
|
572 |
to only contain matching entries
|
|
573 |
:param new_count: The number of new entries
|
|
574 |
:return: The count of matching values
|
|
575 |
"""
|
|
576 |
cdef Py_ssize_t *cur_base |
|
577 |
cdef Py_ssize_t *cur_new |
|
578 |
cdef Py_ssize_t *cur_out |
|
579 |
cdef Py_ssize_t count |
|
580 |
cdef Py_ssize_t *base_end |
|
581 |
cdef Py_ssize_t *new_end |
|
582 |
||
583 |
cur_base = base |
|
584 |
cur_out = base |
|
585 |
cur_new = new_values |
|
586 |
base_end = base + base_count |
|
587 |
new_end = new_values + new_count |
|
588 |
count = 0 |
|
589 |
||
590 |
while cur_base < base_end and cur_new < new_end: |
|
591 |
if cur_base[0] == cur_new[0]: |
|
592 |
# This may be assigning to self.
|
|
593 |
cur_out[0] = cur_base[0] |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
594 |
count = count + 1 |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
595 |
cur_out = cur_out + 1 |
596 |
# We matched, so move both pointers
|
|
597 |
cur_base = cur_base + 1 |
|
598 |
cur_new = cur_new + 1 |
|
599 |
elif cur_base[0] < cur_new[0]: |
|
600 |
# cur_base is "behind" so increment it
|
|
601 |
cur_base = cur_base + 1 |
|
602 |
else: |
|
603 |
# cur_new must be "behind" move its pointer
|
|
604 |
cur_new = cur_new + 1 |
|
605 |
return count |
|
606 |
||
607 |
||
608 |
cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count): |
|
609 |
cdef int i |
|
610 |
lst = [] |
|
611 |
for i from 0 <= i < count: |
|
|
0.18.33
by John Arbash Meinel
Use the cached match from the previous run, drops time from 2.4s => 2.0s on inventory.py |
612 |
PyList_Append(lst, array[i]) |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
613 |
return lst |
614 |
||
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
615 |
|
|
0.18.33
by John Arbash Meinel
Use the cached match from the previous run, drops time from 2.4s => 2.0s on inventory.py |
616 |
cdef Py_ssize_t list_as_array(object lst, Py_ssize_t **array) except -1: |
617 |
"""Convert a Python sequence to an integer array. |
|
618 |
||
619 |
:return: The size of the output, -1 on error
|
|
620 |
"""
|
|
621 |
cdef int i |
|
622 |
cdef PyObject *seq |
|
623 |
cdef Py_ssize_t size |
|
624 |
||
625 |
seq = PySequence_Fast(lst, "expected a sequence for lst") |
|
626 |
try: |
|
627 |
size = PySequence_Fast_GET_SIZE(seq) |
|
628 |
array[0] = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * size) |
|
629 |
for i from 0 <= i < size: |
|
630 |
array[0][i] = <object>PySequence_Fast_GET_ITEM(seq, i) |
|
631 |
finally: |
|
632 |
Py_DECREF(seq) |
|
633 |
return size |
|
634 |
||
635 |
||
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
636 |
def _get_longest_match(equivalence_table, pos, max_pos, locations): |
637 |
"""Get the longest possible match for the current position.""" |
|
638 |
cdef EquivalenceTable eq |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
639 |
cdef Py_ssize_t cpos |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
640 |
cdef Py_ssize_t cmax_pos |
641 |
cdef Py_ssize_t *matches |
|
642 |
cdef Py_ssize_t match_count |
|
643 |
cdef PyObject *line |
|
644 |
cdef Py_ssize_t *copy_ends |
|
645 |
cdef Py_ssize_t copy_count |
|
646 |
cdef Py_ssize_t range_len |
|
647 |
cdef Py_ssize_t in_start |
|
648 |
cdef Py_ssize_t i |
|
649 |
cdef Py_ssize_t intersect_count |
|
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
650 |
cdef int did_match |
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
651 |
|
652 |
eq = equivalence_table |
|
653 |
cpos = pos |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
654 |
cmax_pos = max_pos |
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
655 |
range_start = pos |
656 |
range_len = 0 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
657 |
matches = NULL |
658 |
match_count = 0 |
|
659 |
copy_ends = NULL |
|
660 |
copy_count = 0 |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
661 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
662 |
# TODO: Handle when locations is not None
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
663 |
# if locations is not None:
|
664 |
# copy_count = list_as_array(locations, ©_ends)
|
|
665 |
# # We know the match for this position
|
|
666 |
# cpos = cpos + 1
|
|
667 |
# range_len = 1
|
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
668 |
while cpos < cmax_pos: |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
669 |
# TODO: Instead of grabbing the full list of matches and then doing an
|
670 |
# intersection, use a helper that does the intersection
|
|
671 |
# as-it-goes.
|
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
672 |
line = PyList_GET_ITEM(eq._right_lines, cpos) |
673 |
if copy_ends == NULL: |
|
674 |
copy_count = eq.get_raw_matches(line, ©_ends) |
|
675 |
if copy_count == 0: |
|
676 |
# No matches for this line
|
|
677 |
cpos = cpos + 1 |
|
678 |
break
|
|
679 |
# point at the next location, for the next round trip
|
|
680 |
range_len = 1 |
|
681 |
for i from 0 <= i < copy_count: |
|
682 |
copy_ends[i] = copy_ends[i] + 1 |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
683 |
else: |
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
684 |
# We are in the middle of a match
|
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
685 |
intersect_count = eq.get_next_matches(line, copy_ends, copy_count, |
686 |
&did_match) |
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
687 |
if intersect_count == 0: |
|
0.18.36
by John Arbash Meinel
Small tweak makes a big difference on inventory.py, minor otherwise. |
688 |
# No more matches
|
689 |
if not did_match: |
|
690 |
# because the next line is not interesting
|
|
691 |
cpos = cpos + 1 |
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
692 |
break
|
693 |
else: |
|
694 |
copy_count = intersect_count |
|
695 |
range_len = range_len + 1 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
696 |
cpos = cpos + 1 |
697 |
if copy_ends == NULL or copy_count == 0: |
|
698 |
# We never had a match
|
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
699 |
return None, cpos, None |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
700 |
# Because copy_ends is always sorted, we know the 'min' is just the first
|
701 |
# node
|
|
702 |
in_start = copy_ends[0] |
|
703 |
safe_free(<void**>©_ends) |
|
|
0.18.34
by John Arbash Meinel
Doing the intersection as we go allows us to short-cut a bit more. |
704 |
return ((in_start - range_len), range_start, range_len), cpos, None |