bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
1 |
# Copyright (C) 2008 Canonical Limited.
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License version 2 as published
|
|
5 |
# by the Free Software Foundation.
|
|
6 |
#
|
|
7 |
# This program is distributed in the hope that it will be useful,
|
|
8 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10 |
# GNU General Public License for more details.
|
|
11 |
#
|
|
12 |
# You should have received a copy of the GNU General Public License
|
|
13 |
# along with this program; if not, write to the Free Software
|
|
14 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
15 |
#
|
|
16 |
||
17 |
"""Compiled extensions for doing compression."""
|
|
18 |
||
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
19 |
cdef extern from *: |
20 |
ctypedef unsigned long size_t |
|
21 |
void * malloc(size_t) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
22 |
void * realloc(void *, size_t) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
23 |
void free(void *) |
24 |
||
25 |
cdef extern from "Python.h": |
|
26 |
struct _PyObject: |
|
27 |
pass
|
|
28 |
ctypedef _PyObject PyObject |
|
29 |
PyObject *PySequence_Fast(object, char *) except NULL |
|
30 |
Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *) |
|
31 |
PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
32 |
PyObject *PyList_GET_ITEM(object, Py_ssize_t) |
33 |
int PyList_Append(object, object) except -1 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
34 |
long PyObject_Hash(PyObject *) except -1 |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
35 |
# We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
|
36 |
# if there is an exception *for* us.
|
|
37 |
int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1 |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
38 |
int PyObject_Not(PyObject *) except -1 |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
39 |
void Py_DECREF(PyObject *) |
40 |
void Py_INCREF(PyObject *) |
|
41 |
||
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
42 |
cdef enum _raw_line_flags: |
43 |
INDEXED = 0x01 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
44 |
|
45 |
cdef struct _raw_line: |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
46 |
long hash # Cached form of the hash for this entry |
47 |
Py_ssize_t hash_offset # The location in the hash table for this object |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
48 |
Py_ssize_t next_line_index # Next line which is equivalent to this one |
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
49 |
int flags # status flags |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
50 |
PyObject *data # Raw pointer to the original line |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
51 |
|
52 |
||
53 |
cdef struct _hash_bucket: |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
54 |
Py_ssize_t line_index # First line in the left side for this bucket |
55 |
Py_ssize_t count # Number of equivalent lines, DO we even need this? |
|
56 |
||
57 |
||
58 |
cdef Py_ssize_t SENTINEL |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
59 |
SENTINEL = -1 |
60 |
||
61 |
||
62 |
cdef void *safe_malloc(size_t count) except NULL: |
|
63 |
cdef void *result |
|
64 |
result = malloc(count) |
|
65 |
if result == NULL: |
|
66 |
raise MemoryError('Failed to allocate %d bytes of memory' % (count,)) |
|
67 |
return result |
|
68 |
||
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
69 |
|
70 |
cdef void *safe_realloc(void * old, size_t count) except NULL: |
|
71 |
cdef void *result |
|
72 |
result = realloc(old, count) |
|
73 |
if result == NULL: |
|
74 |
raise MemoryError('Failed to reallocate to %d bytes of memory' |
|
75 |
% (count,)) |
|
76 |
return result |
|
77 |
||
78 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
79 |
cdef int safe_free(void **val) except -1: |
80 |
assert val != NULL |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
81 |
if val[0] != NULL: |
82 |
free(val[0]) |
|
83 |
val[0] = NULL |
|
84 |
||
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
85 |
|
86 |
cdef class EquivalenceTable: |
|
87 |
"""This tracks equivalencies between lists of hashable objects. |
|
88 |
||
89 |
:ivar lines: The 'static' lines that will be preserved between runs.
|
|
90 |
"""
|
|
91 |
||
92 |
cdef readonly object lines |
|
93 |
cdef readonly object _right_lines |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
94 |
cdef Py_ssize_t _hashtable_size |
95 |
cdef Py_ssize_t _hashtable_bitmask |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
96 |
cdef _hash_bucket *_hashtable |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
97 |
cdef _raw_line *_raw_lines |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
98 |
cdef Py_ssize_t _len_lines |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
99 |
|
100 |
def __init__(self, lines): |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
101 |
self.lines = list(lines) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
102 |
self._right_lines = None |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
103 |
self._hashtable_size = 0 |
104 |
self._hashtable = NULL |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
105 |
self._raw_lines = NULL |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
106 |
self._len_lines = 0 |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
107 |
self._lines_to_raw_lines(lines) |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
108 |
self._build_hash_table() |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
109 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
110 |
def __dealloc__(self): |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
111 |
safe_free(<void**>&self._hashtable) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
112 |
safe_free(<void**>&self._raw_lines) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
113 |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
114 |
cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1: |
115 |
"""Convert a single PyObject into a raw line.""" |
|
116 |
raw_line.hash = PyObject_Hash(line) |
|
117 |
raw_line.next_line_index = SENTINEL |
|
118 |
raw_line.hash_offset = SENTINEL |
|
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
119 |
raw_line.flags = INDEXED |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
120 |
raw_line.data = line |
121 |
||
122 |
cdef int _lines_to_raw_lines(self, object lines) except -1: |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
123 |
"""Load a sequence of objects into the _raw_line format""" |
124 |
cdef Py_ssize_t count, i |
|
125 |
cdef PyObject *seq |
|
126 |
cdef PyObject *item |
|
127 |
cdef _raw_line *raw |
|
128 |
||
129 |
# Do we want to use PySequence_Fast, or just assume that it is a list
|
|
130 |
# Also, do we need to decref the return value?
|
|
131 |
# http://www.python.org/doc/current/api/sequence.html
|
|
132 |
seq = PySequence_Fast(lines, "expected a sequence") |
|
133 |
try: |
|
134 |
count = PySequence_Fast_GET_SIZE(seq) |
|
135 |
if count == 0: |
|
136 |
return 0 |
|
137 |
raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line)) |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
138 |
safe_free(<void**>&self._raw_lines) |
139 |
self._raw_lines = raw |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
140 |
self._len_lines = count |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
141 |
|
142 |
for i from 0 <= i < count: |
|
143 |
item = PySequence_Fast_GET_ITEM(seq, i) |
|
144 |
# NB: We don't Py_INCREF the data pointer, because we know we
|
|
145 |
# maintain a pointer to the item in self.lines or
|
|
146 |
# self._right_lines
|
|
147 |
# TODO: Do we even need to track a data pointer here? It would
|
|
148 |
# be less memory, and we *could* just look it up in the
|
|
149 |
# appropriate line list.
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
150 |
self._line_to_raw_line(item, &raw[i]) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
151 |
finally: |
152 |
# TODO: Unfortunately, try/finally always generates a compiler
|
|
153 |
# warning about a possibly unused variable :(
|
|
154 |
Py_DECREF(seq) |
|
155 |
return count |
|
156 |
||
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
157 |
cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed): |
158 |
"""Determine the smallest hash size that can reasonably fit the data. |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
159 |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
160 |
:param needed: The number of entries we might be inserting into
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
161 |
the hash (assuming there are no duplicate lines.)
|
162 |
:return: The number of hash table entries to use. This will always be a
|
|
163 |
power of two.
|
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
164 |
"""
|
165 |
cdef Py_ssize_t hash_size |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
166 |
cdef Py_ssize_t min_size |
167 |
||
168 |
# TODO: Another alternative would be to actually count how full the
|
|
169 |
# hash-table is, and decide if we need to grow it based on
|
|
170 |
# density. That can take into account duplicated lines. Though if
|
|
171 |
# we compress well, there should be a minimal amount of
|
|
172 |
# duplicated lines in the output.
|
|
173 |
||
174 |
# At the bare minimum, we could fit all entries into a 'needed'
|
|
175 |
# size hash table. However, any collision would then have a long way to
|
|
176 |
# traverse before it could find a 'free' slot.
|
|
177 |
# So we set the minimum size to give us 33% empty slots.
|
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
178 |
min_size = <Py_ssize_t>(needed * 1.5) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
179 |
hash_size = 1 |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
180 |
while hash_size < min_size: |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
181 |
hash_size = hash_size << 1 |
182 |
return hash_size |
|
183 |
||
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
184 |
def _py_compute_minimum_hash_size(self, needed): |
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
185 |
"""Expose _compute_minimum_hash_size to python for testing.""" |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
186 |
return self._compute_minimum_hash_size(needed) |
187 |
||
188 |
cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed): |
|
189 |
"""Determine a reasonable hash size, assuming some room for growth. |
|
190 |
|
|
191 |
:param needed: The number of entries we might be inserting into
|
|
192 |
the hash (assuming there are no duplicate lines.)
|
|
193 |
:return: The number of hash table entries to use. This will always be a
|
|
194 |
power of two.
|
|
195 |
"""
|
|
196 |
cdef Py_ssize_t hash_size |
|
197 |
cdef Py_ssize_t min_size |
|
198 |
||
199 |
# We start off with a 8k hash (after doubling), because there isn't a
|
|
200 |
# lot of reason to go smaller than that (this class isn't one you'll be
|
|
201 |
# instantiating thousands of, and you are always likely to grow here.)
|
|
202 |
hash_size = 4096 |
|
203 |
while hash_size < needed: |
|
204 |
hash_size = hash_size << 1 |
|
205 |
# And we always give at least 2x blank space
|
|
206 |
hash_size = hash_size << 1 |
|
207 |
return hash_size |
|
208 |
||
209 |
def _py_compute_recommended_hash_size(self, needed): |
|
210 |
"""Expose _compute_recommended_hash_size to python for testing.""" |
|
211 |
return self._compute_recommended_hash_size(needed) |
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
212 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
213 |
cdef int _build_hash_table(self) except -1: |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
214 |
"""Build the hash table 'from scratch'.""" |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
215 |
cdef Py_ssize_t hash_size |
216 |
cdef Py_ssize_t hash_bitmask |
|
217 |
cdef Py_ssize_t i |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
218 |
cdef _raw_line *cur_line |
219 |
cdef Py_ssize_t hash_offset |
|
220 |
cdef _hash_bucket *cur_bucket |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
221 |
cdef _hash_bucket *new_hashtable |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
222 |
|
223 |
# Hash size is a power of 2
|
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
224 |
hash_size = self._compute_recommended_hash_size(self._len_lines) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
225 |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
226 |
new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) * |
227 |
hash_size) |
|
228 |
safe_free(<void**>&self._hashtable) |
|
229 |
self._hashtable = new_hashtable |
|
230 |
||
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
231 |
self._hashtable_size = hash_size |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
232 |
for i from 0 <= i < hash_size: |
233 |
self._hashtable[i].line_index = SENTINEL |
|
234 |
self._hashtable[i].count = 0 |
|
235 |
||
236 |
# Turn the hash size into a bitmask
|
|
237 |
self._hashtable_bitmask = hash_size - 1 |
|
238 |
||
239 |
# Iterate backwards, because it makes it easier to insert items into
|
|
240 |
# the hash (you just change the head pointer, and everything else keeps
|
|
241 |
# pointing to the same location).
|
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
242 |
for i from self._len_lines > i >= 0: |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
243 |
cur_line = self._raw_lines + i |
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
244 |
if not (cur_line.flags & INDEXED): |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
245 |
continue
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
246 |
hash_offset = self._find_hash_position(cur_line) |
247 |
||
248 |
# Point this line to the location in the hash table
|
|
249 |
cur_line.hash_offset = hash_offset |
|
250 |
# And make this line the head of the hash table
|
|
251 |
cur_bucket = self._hashtable + hash_offset |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
252 |
cur_line.next_line_index = cur_bucket.line_index |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
253 |
cur_bucket.line_index = i |
254 |
cur_bucket.count += 1 |
|
255 |
||
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
256 |
cdef int _extend_hash_table_raw(self, PyObject *seq_index) except -1: |
257 |
cdef Py_ssize_t new_count |
|
258 |
cdef Py_ssize_t new_total_len |
|
259 |
cdef Py_ssize_t old_len |
|
260 |
cdef PyObject *item |
|
261 |
cdef PyObject *should_index |
|
262 |
cdef Py_ssize_t i |
|
263 |
cdef Py_ssize_t line_index |
|
264 |
cdef _hash_bucket *cur_bucket |
|
265 |
cdef _raw_line *cur_line |
|
266 |
cdef _raw_line *next_line |
|
267 |
cdef Py_ssize_t hash_offset |
|
268 |
cdef PyObject *local_lines |
|
269 |
||
270 |
old_len = self._len_lines |
|
271 |
new_count = PySequence_Fast_GET_SIZE(seq_index) |
|
272 |
new_total_len = new_count + self._len_lines |
|
273 |
self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines, |
|
274 |
new_total_len * sizeof(_raw_line)) |
|
275 |
self._len_lines = new_total_len |
|
276 |
# Now that we have enough space, start adding the new lines
|
|
277 |
# into the array. These are done in forward order.
|
|
278 |
for i from 0 <= i < new_count: |
|
279 |
line_index = i + old_len |
|
280 |
cur_line = self._raw_lines + line_index |
|
281 |
item = PyList_GET_ITEM(self.lines, line_index) |
|
282 |
self._line_to_raw_line(item, cur_line) |
|
283 |
should_index = PySequence_Fast_GET_ITEM(seq_index, i) |
|
284 |
if PyObject_Not(should_index): |
|
285 |
cur_line.flags &= ~(<int>INDEXED) |
|
286 |
continue
|
|
287 |
hash_offset = self._find_hash_position(cur_line) |
|
288 |
||
289 |
# Point this line to the location in the hash table
|
|
290 |
cur_line.hash_offset = hash_offset |
|
291 |
||
292 |
# Make this line the tail of the hash table
|
|
293 |
cur_bucket = self._hashtable + hash_offset |
|
294 |
cur_bucket.count += 1 |
|
295 |
if cur_bucket.line_index == SENTINEL: |
|
296 |
cur_bucket.line_index = line_index |
|
297 |
continue
|
|
298 |
# We need to track through the pointers and insert this at
|
|
299 |
# the end
|
|
300 |
next_line = self._raw_lines + cur_bucket.line_index |
|
301 |
while next_line.next_line_index != SENTINEL: |
|
302 |
next_line = self._raw_lines + next_line.next_line_index |
|
303 |
next_line.next_line_index = line_index |
|
304 |
||
305 |
cdef int _extend_hash_table(self, object index) except -1: |
|
306 |
"""Add the last N entries in self.lines to the hash table. |
|
307 |
||
308 |
:param index: A sequence that declares whether each node should be
|
|
309 |
INDEXED or not.
|
|
310 |
"""
|
|
311 |
cdef PyObject *seq_index |
|
312 |
||
313 |
seq_index = PySequence_Fast(index, "expected a sequence for index") |
|
314 |
try: |
|
315 |
self._extend_hash_table_raw(seq_index) |
|
316 |
finally: |
|
317 |
Py_DECREF(seq_index) |
|
318 |
||
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
319 |
cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1: |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
320 |
"""Find the node in the hash which defines this line. |
321 |
||
322 |
Each bucket in the hash table points at exactly 1 equivalent line. If 2
|
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
323 |
objects would collide, we just increment to the next bucket until we
|
324 |
get to an empty bucket that is either empty or exactly matches this
|
|
325 |
object.
|
|
326 |
||
327 |
:return: The offset in the hash table for this entry
|
|
328 |
"""
|
|
329 |
cdef Py_ssize_t location |
|
330 |
cdef _raw_line *ref_line |
|
331 |
cdef Py_ssize_t ref_index |
|
332 |
cdef int compare_result |
|
333 |
||
334 |
location = line.hash & self._hashtable_bitmask |
|
335 |
ref_index = self._hashtable[location].line_index |
|
336 |
while ref_index != SENTINEL: |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
337 |
ref_line = self._raw_lines + ref_index |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
338 |
if (ref_line.hash == line.hash): |
339 |
PyObject_Cmp(ref_line.data, line.data, &compare_result) |
|
340 |
if compare_result == 0: |
|
341 |
break
|
|
342 |
location = (location + 1) & self._hashtable_bitmask |
|
343 |
ref_index = self._hashtable[location].line_index |
|
344 |
return location |
|
345 |
||
346 |
def _py_find_hash_position(self, line): |
|
347 |
"""Used only for testing. |
|
348 |
||
349 |
Return the location where this fits in the hash table
|
|
350 |
"""
|
|
351 |
cdef _raw_line raw_line |
|
352 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
353 |
self._line_to_raw_line(<PyObject *>line, &raw_line) |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
354 |
return self._find_hash_position(&raw_line) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
355 |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
356 |
def _inspect_left_lines(self): |
357 |
"""Used only for testing. |
|
358 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
359 |
:return: None if _raw_lines is NULL,
|
|
0.18.18
by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result. |
360 |
else [(object, hash_val, hash_loc, next_val)] for each node in raw
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
361 |
lines.
|
362 |
"""
|
|
363 |
cdef Py_ssize_t i |
|
364 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
365 |
if self._raw_lines == NULL: |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
366 |
return None |
367 |
||
368 |
result = [] |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
369 |
for i from 0 <= i < self._len_lines: |
370 |
PyList_Append(result, |
|
371 |
(<object>self._raw_lines[i].data, |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
372 |
self._raw_lines[i].hash, |
373 |
self._raw_lines[i].hash_offset, |
|
374 |
self._raw_lines[i].next_line_index, |
|
|
0.18.18
by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result. |
375 |
))
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
376 |
return result |
377 |
||
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
378 |
def _inspect_hash_table(self): |
379 |
"""Used only for testing. |
|
380 |
||
381 |
This iterates the hash table, and returns 'interesting' entries.
|
|
382 |
:return: (total_size, [(offset, line_index, count)] for entries that
|
|
383 |
are not empty.
|
|
384 |
"""
|
|
385 |
cdef int i |
|
386 |
||
387 |
interesting = [] |
|
388 |
for i from 0 <= i < self._hashtable_size: |
|
389 |
if self._hashtable[i].line_index != SENTINEL: |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
390 |
PyList_Append(interesting, |
391 |
(i, self._hashtable[i].line_index, |
|
392 |
self._hashtable[i].count)) |
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
393 |
return (self._hashtable_size, interesting) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
394 |
|
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
395 |
def get_matches(self, line): |
396 |
"""Return the lines which match the line in right.""" |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
397 |
cdef Py_ssize_t hash_offset |
398 |
cdef _raw_line raw_line |
|
399 |
cdef _hash_bucket cur_bucket |
|
400 |
cdef Py_ssize_t cur_line_idx |
|
401 |
||
402 |
self._line_to_raw_line(<PyObject *>line, &raw_line) |
|
403 |
hash_offset = self._find_hash_position(&raw_line) |
|
404 |
cur_bucket = self._hashtable[hash_offset] |
|
405 |
cur_line_idx = cur_bucket.line_index |
|
406 |
if cur_line_idx == SENTINEL: |
|
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
407 |
return None |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
408 |
result = [] |
409 |
while cur_line_idx != SENTINEL: |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
410 |
PyList_Append(result, cur_line_idx) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
411 |
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index |
412 |
assert len(result) == cur_bucket.count |
|
413 |
return result |
|
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
414 |
|
415 |
def _get_matching_lines(self): |
|
416 |
"""Return a dictionary showing matching lines.""" |
|
417 |
matching = {} |
|
418 |
for line in self.lines: |
|
419 |
matching[line] = self.get_matches(line) |
|
420 |
return matching |
|
421 |
||
422 |
def get_idx_matches(self, right_idx): |
|
423 |
"""Return the left lines matching the right line at the given offset.""" |
|
424 |
line = self._right_lines[right_idx] |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
425 |
return self.get_matches(line) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
426 |
|
427 |
def extend_lines(self, lines, index): |
|
428 |
"""Add more lines to the left-lines list. |
|
429 |
||
430 |
:param lines: A list of lines to add
|
|
431 |
:param index: A True/False for each node to define if it should be
|
|
432 |
indexed.
|
|
433 |
"""
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
434 |
cdef Py_ssize_t orig_len |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
435 |
cdef Py_ssize_t min_new_hash_size |
436 |
assert len(lines) == len(index) |
|
437 |
min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) + |
|
438 |
len(lines)) |
|
439 |
if self._hashtable_size >= min_new_hash_size: |
|
440 |
# Just add the new lines, don't bother resizing the hash table
|
|
441 |
self.lines.extend(lines) |
|
442 |
self._extend_hash_table(index) |
|
443 |
return
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
444 |
orig_len = len(self.lines) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
445 |
self.lines.extend(lines) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
446 |
self._lines_to_raw_lines(self.lines) |
447 |
for idx, val in enumerate(index): |
|
448 |
if not val: |
|
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
449 |
self._raw_lines[orig_len + idx].flags &= ~(<int>INDEXED) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
450 |
self._build_hash_table() |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
451 |
|
452 |
def set_right_lines(self, lines): |
|
453 |
"""Set the lines we will be matching against.""" |
|
454 |
self._right_lines = lines |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
455 |
|
456 |
||
457 |
def _get_longest_match(equivalence_table, pos, max_pos, locations): |
|
458 |
"""Get the longest possible match for the current position.""" |
|
459 |
cdef EquivalenceTable eq |
|
460 |
cdef int cpos |
|
461 |
||
462 |
eq = equivalence_table |
|
463 |
cpos = pos |
|
464 |
range_start = pos |
|
465 |
range_len = 0 |
|
466 |
copy_ends = None |
|
467 |
||
468 |
while pos < max_pos: |
|
469 |
if locations is None: |
|
470 |
locations = equivalence_table.get_idx_matches(pos) |
|
471 |
if locations is None: |
|
472 |
# No more matches, just return whatever we have, but we know that
|
|
473 |
# this last position is not going to match anything
|
|
474 |
pos = pos + 1 |
|
475 |
break
|
|
476 |
else: |
|
477 |
if copy_ends is None: |
|
478 |
# We are starting a new range
|
|
479 |
copy_ends = [] |
|
480 |
for loc in locations: |
|
481 |
copy_ends.append(loc + 1) |
|
482 |
range_len = 1 |
|
483 |
locations = None # Consumed |
|
484 |
else: |
|
485 |
# We are currently in the middle of a match
|
|
486 |
next_locations = set(copy_ends).intersection(locations) |
|
487 |
if len(next_locations): |
|
488 |
# range continues
|
|
489 |
copy_ends = [] |
|
490 |
for loc in next_locations: |
|
491 |
copy_ends.append(loc + 1) |
|
492 |
range_len = range_len + 1 |
|
493 |
locations = None # Consumed |
|
494 |
else: |
|
495 |
# But we are done with this match, we should be
|
|
496 |
# starting a new one, though. We will pass back 'locations'
|
|
497 |
# so that we don't have to do another lookup.
|
|
498 |
break
|
|
499 |
pos = pos + 1 |
|
500 |
if copy_ends is None: |
|
501 |
return None, pos, locations |
|
502 |
return ((min(copy_ends) - range_len, range_start, range_len)), pos, locations |
|
503 |
||
504 |