bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
1 |
# Copyright (C) 2008 Canonical Limited.
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License version 2 as published
|
|
5 |
# by the Free Software Foundation.
|
|
6 |
#
|
|
7 |
# This program is distributed in the hope that it will be useful,
|
|
8 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10 |
# GNU General Public License for more details.
|
|
11 |
#
|
|
12 |
# You should have received a copy of the GNU General Public License
|
|
13 |
# along with this program; if not, write to the Free Software
|
|
14 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
15 |
#
|
|
16 |
||
17 |
"""Compiled extensions for doing compression."""
|
|
18 |
||
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
19 |
cdef extern from *: |
20 |
ctypedef unsigned long size_t |
|
21 |
void * malloc(size_t) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
22 |
void * realloc(void *, size_t) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
23 |
void free(void *) |
24 |
||
25 |
cdef extern from "Python.h": |
|
26 |
struct _PyObject: |
|
27 |
pass
|
|
28 |
ctypedef _PyObject PyObject |
|
29 |
PyObject *PySequence_Fast(object, char *) except NULL |
|
30 |
Py_ssize_t PySequence_Fast_GET_SIZE(PyObject *) |
|
31 |
PyObject *PySequence_Fast_GET_ITEM(PyObject *, Py_ssize_t) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
32 |
PyObject *PyList_GET_ITEM(object, Py_ssize_t) |
33 |
int PyList_Append(object, object) except -1 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
34 |
long PyObject_Hash(PyObject *) except -1 |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
35 |
# We use PyObject_Cmp rather than PyObject_Compare because pyrex will check
|
36 |
# if there is an exception *for* us.
|
|
37 |
int PyObject_Cmp(PyObject *, PyObject *, int *result) except -1 |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
38 |
int PyObject_Not(PyObject *) except -1 |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
39 |
void Py_DECREF(PyObject *) |
40 |
void Py_INCREF(PyObject *) |
|
41 |
||
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
42 |
cdef enum _raw_line_flags: |
43 |
INDEXED = 0x01 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
44 |
|
45 |
cdef struct _raw_line: |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
46 |
long hash # Cached form of the hash for this entry |
47 |
Py_ssize_t hash_offset # The location in the hash table for this object |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
48 |
Py_ssize_t next_line_index # Next line which is equivalent to this one |
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
49 |
int flags # status flags |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
50 |
PyObject *data # Raw pointer to the original line |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
51 |
|
52 |
||
53 |
cdef struct _hash_bucket: |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
54 |
Py_ssize_t line_index # First line in the left side for this bucket |
55 |
Py_ssize_t count # Number of equivalent lines, DO we even need this? |
|
56 |
||
57 |
||
58 |
cdef Py_ssize_t SENTINEL |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
59 |
SENTINEL = -1 |
60 |
||
61 |
||
62 |
cdef void *safe_malloc(size_t count) except NULL: |
|
63 |
cdef void *result |
|
64 |
result = malloc(count) |
|
65 |
if result == NULL: |
|
66 |
raise MemoryError('Failed to allocate %d bytes of memory' % (count,)) |
|
67 |
return result |
|
68 |
||
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
69 |
|
70 |
cdef void *safe_realloc(void * old, size_t count) except NULL: |
|
71 |
cdef void *result |
|
72 |
result = realloc(old, count) |
|
73 |
if result == NULL: |
|
74 |
raise MemoryError('Failed to reallocate to %d bytes of memory' |
|
75 |
% (count,)) |
|
76 |
return result |
|
77 |
||
78 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
79 |
cdef int safe_free(void **val) except -1: |
80 |
assert val != NULL |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
81 |
if val[0] != NULL: |
82 |
free(val[0]) |
|
83 |
val[0] = NULL |
|
84 |
||
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
85 |
|
86 |
cdef class EquivalenceTable: |
|
87 |
"""This tracks equivalencies between lists of hashable objects. |
|
88 |
||
89 |
:ivar lines: The 'static' lines that will be preserved between runs.
|
|
90 |
"""
|
|
91 |
||
92 |
cdef readonly object lines |
|
93 |
cdef readonly object _right_lines |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
94 |
cdef Py_ssize_t _hashtable_size |
95 |
cdef Py_ssize_t _hashtable_bitmask |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
96 |
cdef _hash_bucket *_hashtable |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
97 |
cdef _raw_line *_raw_lines |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
98 |
cdef Py_ssize_t _len_lines |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
99 |
|
100 |
def __init__(self, lines): |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
101 |
self.lines = list(lines) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
102 |
self._right_lines = None |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
103 |
self._hashtable_size = 0 |
104 |
self._hashtable = NULL |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
105 |
self._raw_lines = NULL |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
106 |
self._len_lines = 0 |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
107 |
self._lines_to_raw_lines(lines) |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
108 |
self._build_hash_table() |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
109 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
110 |
def __dealloc__(self): |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
111 |
safe_free(<void**>&self._hashtable) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
112 |
safe_free(<void**>&self._raw_lines) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
113 |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
114 |
cdef int _line_to_raw_line(self, PyObject *line, _raw_line *raw_line) except -1: |
115 |
"""Convert a single PyObject into a raw line.""" |
|
116 |
raw_line.hash = PyObject_Hash(line) |
|
117 |
raw_line.next_line_index = SENTINEL |
|
118 |
raw_line.hash_offset = SENTINEL |
|
|
0.18.22
by John Arbash Meinel
Switch the indexed integer into a flags integer. |
119 |
raw_line.flags = INDEXED |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
120 |
raw_line.data = line |
121 |
||
122 |
cdef int _lines_to_raw_lines(self, object lines) except -1: |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
123 |
"""Load a sequence of objects into the _raw_line format""" |
124 |
cdef Py_ssize_t count, i |
|
125 |
cdef PyObject *seq |
|
126 |
cdef PyObject *item |
|
127 |
cdef _raw_line *raw |
|
128 |
||
129 |
# Do we want to use PySequence_Fast, or just assume that it is a list
|
|
130 |
# Also, do we need to decref the return value?
|
|
131 |
# http://www.python.org/doc/current/api/sequence.html
|
|
132 |
seq = PySequence_Fast(lines, "expected a sequence") |
|
133 |
try: |
|
134 |
count = PySequence_Fast_GET_SIZE(seq) |
|
135 |
if count == 0: |
|
136 |
return 0 |
|
137 |
raw = <_raw_line*>safe_malloc(count * sizeof(_raw_line)) |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
138 |
safe_free(<void**>&self._raw_lines) |
139 |
self._raw_lines = raw |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
140 |
self._len_lines = count |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
141 |
|
142 |
for i from 0 <= i < count: |
|
143 |
item = PySequence_Fast_GET_ITEM(seq, i) |
|
144 |
# NB: We don't Py_INCREF the data pointer, because we know we
|
|
145 |
# maintain a pointer to the item in self.lines or
|
|
146 |
# self._right_lines
|
|
147 |
# TODO: Do we even need to track a data pointer here? It would
|
|
148 |
# be less memory, and we *could* just look it up in the
|
|
149 |
# appropriate line list.
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
150 |
self._line_to_raw_line(item, &raw[i]) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
151 |
finally: |
152 |
# TODO: Unfortunately, try/finally always generates a compiler
|
|
153 |
# warning about a possibly unused variable :(
|
|
154 |
Py_DECREF(seq) |
|
155 |
return count |
|
156 |
||
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
157 |
cdef Py_ssize_t _compute_minimum_hash_size(self, Py_ssize_t needed): |
158 |
"""Determine the smallest hash size that can reasonably fit the data. |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
159 |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
160 |
:param needed: The number of entries we might be inserting into
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
161 |
the hash (assuming there are no duplicate lines.)
|
162 |
:return: The number of hash table entries to use. This will always be a
|
|
163 |
power of two.
|
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
164 |
"""
|
165 |
cdef Py_ssize_t hash_size |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
166 |
cdef Py_ssize_t min_size |
167 |
||
168 |
# TODO: Another alternative would be to actually count how full the
|
|
169 |
# hash-table is, and decide if we need to grow it based on
|
|
170 |
# density. That can take into account duplicated lines. Though if
|
|
171 |
# we compress well, there should be a minimal amount of
|
|
172 |
# duplicated lines in the output.
|
|
173 |
||
174 |
# At the bare minimum, we could fit all entries into a 'needed'
|
|
175 |
# size hash table. However, any collision would then have a long way to
|
|
176 |
# traverse before it could find a 'free' slot.
|
|
177 |
# So we set the minimum size to give us 33% empty slots.
|
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
178 |
min_size = <Py_ssize_t>(needed * 2) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
179 |
hash_size = 1 |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
180 |
while hash_size < min_size: |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
181 |
hash_size = hash_size << 1 |
182 |
return hash_size |
|
183 |
||
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
184 |
def _py_compute_minimum_hash_size(self, needed): |
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
185 |
"""Expose _compute_minimum_hash_size to python for testing.""" |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
186 |
return self._compute_minimum_hash_size(needed) |
187 |
||
188 |
cdef Py_ssize_t _compute_recommended_hash_size(self, Py_ssize_t needed): |
|
189 |
"""Determine a reasonable hash size, assuming some room for growth. |
|
190 |
|
|
191 |
:param needed: The number of entries we might be inserting into
|
|
192 |
the hash (assuming there are no duplicate lines.)
|
|
193 |
:return: The number of hash table entries to use. This will always be a
|
|
194 |
power of two.
|
|
195 |
"""
|
|
196 |
cdef Py_ssize_t hash_size |
|
197 |
cdef Py_ssize_t min_size |
|
198 |
||
199 |
# We start off with a 8k hash (after doubling), because there isn't a
|
|
200 |
# lot of reason to go smaller than that (this class isn't one you'll be
|
|
201 |
# instantiating thousands of, and you are always likely to grow here.)
|
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
202 |
hash_size = 2048 |
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
203 |
while hash_size < needed: |
204 |
hash_size = hash_size << 1 |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
205 |
# And we always give at least 4x blank space
|
206 |
hash_size = hash_size << 2 |
|
|
0.18.16
by John Arbash Meinel
Test the recommended versus minimum hash table sizes. |
207 |
return hash_size |
208 |
||
209 |
def _py_compute_recommended_hash_size(self, needed): |
|
210 |
"""Expose _compute_recommended_hash_size to python for testing.""" |
|
211 |
return self._compute_recommended_hash_size(needed) |
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
212 |
|
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
213 |
cdef int _build_hash_table(self) except -1: |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
214 |
"""Build the hash table 'from scratch'.""" |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
215 |
cdef Py_ssize_t hash_size |
216 |
cdef Py_ssize_t hash_bitmask |
|
217 |
cdef Py_ssize_t i |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
218 |
cdef _raw_line *cur_line |
219 |
cdef Py_ssize_t hash_offset |
|
220 |
cdef _hash_bucket *cur_bucket |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
221 |
cdef _hash_bucket *new_hashtable |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
222 |
|
223 |
# Hash size is a power of 2
|
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
224 |
hash_size = self._compute_recommended_hash_size(self._len_lines) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
225 |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
226 |
new_hashtable = <_hash_bucket*>safe_malloc(sizeof(_hash_bucket) * |
227 |
hash_size) |
|
228 |
safe_free(<void**>&self._hashtable) |
|
229 |
self._hashtable = new_hashtable |
|
230 |
||
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
231 |
self._hashtable_size = hash_size |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
232 |
for i from 0 <= i < hash_size: |
233 |
self._hashtable[i].line_index = SENTINEL |
|
234 |
self._hashtable[i].count = 0 |
|
235 |
||
236 |
# Turn the hash size into a bitmask
|
|
237 |
self._hashtable_bitmask = hash_size - 1 |
|
238 |
||
239 |
# Iterate backwards, because it makes it easier to insert items into
|
|
240 |
# the hash (you just change the head pointer, and everything else keeps
|
|
241 |
# pointing to the same location).
|
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
242 |
for i from self._len_lines > i >= 0: |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
243 |
cur_line = self._raw_lines + i |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
244 |
if not (cur_line.flags & INDEXED == INDEXED): |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
245 |
continue
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
246 |
hash_offset = self._find_hash_position(cur_line) |
247 |
||
248 |
# Point this line to the location in the hash table
|
|
249 |
cur_line.hash_offset = hash_offset |
|
250 |
# And make this line the head of the hash table
|
|
251 |
cur_bucket = self._hashtable + hash_offset |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
252 |
cur_line.next_line_index = cur_bucket.line_index |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
253 |
cur_bucket.line_index = i |
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
254 |
cur_bucket.count = cur_bucket.count + 1 |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
255 |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
256 |
cdef int _extend_raw_lines(self, object index) except -1: |
257 |
"""Add the last N lines from self.lines into the raw_lines array.""" |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
258 |
cdef Py_ssize_t new_count |
259 |
cdef Py_ssize_t new_total_len |
|
260 |
cdef Py_ssize_t old_len |
|
261 |
cdef PyObject *item |
|
262 |
cdef PyObject *should_index |
|
263 |
cdef Py_ssize_t i |
|
264 |
cdef Py_ssize_t line_index |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
265 |
cdef _raw_line *cur_line |
266 |
cdef PyObject *seq_index |
|
267 |
||
268 |
seq_index = PySequence_Fast(index, "expected a sequence for index") |
|
269 |
try: |
|
270 |
old_len = self._len_lines |
|
271 |
new_count = PySequence_Fast_GET_SIZE(seq_index) |
|
272 |
new_total_len = new_count + self._len_lines |
|
273 |
self._raw_lines = <_raw_line*>safe_realloc(<void*>self._raw_lines, |
|
274 |
new_total_len * sizeof(_raw_line)) |
|
275 |
self._len_lines = new_total_len |
|
276 |
# Now that we have enough space, start adding the new lines
|
|
277 |
# into the array. These are done in forward order.
|
|
278 |
for i from 0 <= i < new_count: |
|
279 |
line_index = i + old_len |
|
280 |
cur_line = self._raw_lines + line_index |
|
281 |
item = PyList_GET_ITEM(self.lines, line_index) |
|
282 |
self._line_to_raw_line(item, cur_line) |
|
283 |
should_index = PySequence_Fast_GET_ITEM(seq_index, i) |
|
284 |
if PyObject_Not(should_index): |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
285 |
cur_line.flags = cur_line.flags & ~(<int>INDEXED) |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
286 |
finally: |
287 |
Py_DECREF(seq_index) |
|
288 |
||
289 |
cdef int _extend_hash_table(self, Py_ssize_t count) except -1: |
|
290 |
"""Add the last N entries in self.lines to the hash table. |
|
291 |
||
292 |
:param index: A sequence that declares whether each node should be
|
|
293 |
INDEXED or not.
|
|
294 |
"""
|
|
295 |
cdef Py_ssize_t line_index |
|
296 |
cdef Py_ssize_t start |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
297 |
cdef _raw_line *cur_line |
298 |
cdef _raw_line *next_line |
|
299 |
cdef Py_ssize_t hash_offset |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
300 |
cdef _hash_bucket *cur_bucket |
301 |
||
302 |
start = self._len_lines - count |
|
303 |
||
304 |
for line_index from start <= line_index < self._len_lines: |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
305 |
cur_line = self._raw_lines + line_index |
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
306 |
if cur_line.flags & INDEXED != INDEXED: |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
307 |
continue
|
308 |
hash_offset = self._find_hash_position(cur_line) |
|
309 |
||
310 |
# Point this line to the location in the hash table
|
|
311 |
cur_line.hash_offset = hash_offset |
|
312 |
||
313 |
# Make this line the tail of the hash table
|
|
314 |
cur_bucket = self._hashtable + hash_offset |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
315 |
cur_bucket.count = cur_bucket.count + 1 |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
316 |
if cur_bucket.line_index == SENTINEL: |
317 |
cur_bucket.line_index = line_index |
|
318 |
continue
|
|
319 |
# We need to track through the pointers and insert this at
|
|
320 |
# the end
|
|
321 |
next_line = self._raw_lines + cur_bucket.line_index |
|
322 |
while next_line.next_line_index != SENTINEL: |
|
323 |
next_line = self._raw_lines + next_line.next_line_index |
|
324 |
next_line.next_line_index = line_index |
|
325 |
||
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
326 |
cdef Py_ssize_t _find_hash_position(self, _raw_line *line) except -1: |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
327 |
"""Find the node in the hash which defines this line. |
328 |
||
329 |
Each bucket in the hash table points at exactly 1 equivalent line. If 2
|
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
330 |
objects would collide, we just increment to the next bucket until we
|
331 |
get to an empty bucket that is either empty or exactly matches this
|
|
332 |
object.
|
|
333 |
||
334 |
:return: The offset in the hash table for this entry
|
|
335 |
"""
|
|
336 |
cdef Py_ssize_t location |
|
337 |
cdef _raw_line *ref_line |
|
338 |
cdef Py_ssize_t ref_index |
|
339 |
cdef int compare_result |
|
340 |
||
341 |
location = line.hash & self._hashtable_bitmask |
|
342 |
ref_index = self._hashtable[location].line_index |
|
343 |
while ref_index != SENTINEL: |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
344 |
ref_line = self._raw_lines + ref_index |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
345 |
if (ref_line.hash == line.hash): |
346 |
PyObject_Cmp(ref_line.data, line.data, &compare_result) |
|
347 |
if compare_result == 0: |
|
348 |
break
|
|
349 |
location = (location + 1) & self._hashtable_bitmask |
|
350 |
ref_index = self._hashtable[location].line_index |
|
351 |
return location |
|
352 |
||
353 |
def _py_find_hash_position(self, line): |
|
354 |
"""Used only for testing. |
|
355 |
||
356 |
Return the location where this fits in the hash table
|
|
357 |
"""
|
|
358 |
cdef _raw_line raw_line |
|
359 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
360 |
self._line_to_raw_line(<PyObject *>line, &raw_line) |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
361 |
return self._find_hash_position(&raw_line) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
362 |
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
363 |
def _inspect_left_lines(self): |
364 |
"""Used only for testing. |
|
365 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
366 |
:return: None if _raw_lines is NULL,
|
|
0.18.18
by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result. |
367 |
else [(object, hash_val, hash_loc, next_val)] for each node in raw
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
368 |
lines.
|
369 |
"""
|
|
370 |
cdef Py_ssize_t i |
|
371 |
||
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
372 |
if self._raw_lines == NULL: |
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
373 |
return None |
374 |
||
375 |
result = [] |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
376 |
for i from 0 <= i < self._len_lines: |
377 |
PyList_Append(result, |
|
378 |
(<object>self._raw_lines[i].data, |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
379 |
self._raw_lines[i].hash, |
380 |
self._raw_lines[i].hash_offset, |
|
381 |
self._raw_lines[i].next_line_index, |
|
|
0.18.18
by John Arbash Meinel
Start actually storing matches in the hash table, and testing the result. |
382 |
))
|
|
0.18.17
by John Arbash Meinel
We now build the appropriate hash table entries. |
383 |
return result |
384 |
||
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
385 |
def _inspect_hash_table(self): |
386 |
"""Used only for testing. |
|
387 |
||
388 |
This iterates the hash table, and returns 'interesting' entries.
|
|
389 |
:return: (total_size, [(offset, line_index, count)] for entries that
|
|
390 |
are not empty.
|
|
391 |
"""
|
|
392 |
cdef int i |
|
393 |
||
394 |
interesting = [] |
|
395 |
for i from 0 <= i < self._hashtable_size: |
|
396 |
if self._hashtable[i].line_index != SENTINEL: |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
397 |
PyList_Append(interesting, |
398 |
(i, self._hashtable[i].line_index, |
|
399 |
self._hashtable[i].count)) |
|
|
0.18.15
by John Arbash Meinel
Start writing tests directly for the compiled class |
400 |
return (self._hashtable_size, interesting) |
|
0.18.14
by John Arbash Meinel
A bit more work, not really usable yet. |
401 |
|
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
402 |
def get_matches(self, line): |
403 |
"""Return the lines which match the line in right.""" |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
404 |
cdef Py_ssize_t count |
405 |
cdef Py_ssize_t i |
|
406 |
cdef Py_ssize_t *matches |
|
407 |
||
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
408 |
matches = NULL |
409 |
||
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
410 |
try: |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
411 |
count = self.get_raw_matches(<PyObject *>line, &matches) |
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
412 |
if count == 0: |
413 |
return None |
|
414 |
result = [] |
|
415 |
for i from 0 <= i < count: |
|
416 |
PyList_Append(result, matches[i]) |
|
417 |
finally: |
|
418 |
safe_free(<void**>&matches) |
|
419 |
return result |
|
420 |
||
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
421 |
cdef Py_ssize_t get_raw_matches(self, PyObject *line, |
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
422 |
Py_ssize_t **matches) except -1: |
423 |
"""Get all the possible entries that match the given location. |
|
424 |
||
425 |
:param line: A Python line which we want to match
|
|
426 |
:param pos: The index position in _right_lines
|
|
427 |
:param matches: A variable to hold an array containing all the matches.
|
|
428 |
This will be allocated using malloc, and the caller is responsible
|
|
429 |
to free it with free(). If there are no matches, no memory will be
|
|
430 |
allocated.
|
|
431 |
:retrun: The number of matched positions
|
|
432 |
"""
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
433 |
cdef Py_ssize_t hash_offset |
434 |
cdef _raw_line raw_line |
|
435 |
cdef _hash_bucket cur_bucket |
|
436 |
cdef Py_ssize_t cur_line_idx |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
437 |
cdef Py_ssize_t count |
438 |
cdef Py_ssize_t *local_matches |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
439 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
440 |
self._line_to_raw_line(line, &raw_line) |
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
441 |
hash_offset = self._find_hash_position(&raw_line) |
442 |
cur_bucket = self._hashtable[hash_offset] |
|
443 |
cur_line_idx = cur_bucket.line_index |
|
444 |
if cur_line_idx == SENTINEL: |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
445 |
return 0 |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
446 |
local_matches = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * |
447 |
(cur_bucket.count + 1)) |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
448 |
matches[0] = local_matches |
449 |
for count from 0 <= count < cur_bucket.count: |
|
450 |
assert cur_line_idx != SENTINEL |
|
451 |
local_matches[count] = cur_line_idx |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
452 |
cur_line_idx = self._raw_lines[cur_line_idx].next_line_index |
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
453 |
return cur_bucket.count |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
454 |
|
455 |
def _get_matching_lines(self): |
|
456 |
"""Return a dictionary showing matching lines.""" |
|
457 |
matching = {} |
|
458 |
for line in self.lines: |
|
459 |
matching[line] = self.get_matches(line) |
|
460 |
return matching |
|
461 |
||
462 |
def get_idx_matches(self, right_idx): |
|
463 |
"""Return the left lines matching the right line at the given offset.""" |
|
464 |
line = self._right_lines[right_idx] |
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
465 |
return self.get_matches(line) |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
466 |
|
467 |
def extend_lines(self, lines, index): |
|
468 |
"""Add more lines to the left-lines list. |
|
469 |
||
470 |
:param lines: A list of lines to add
|
|
471 |
:param index: A True/False for each node to define if it should be
|
|
472 |
indexed.
|
|
473 |
"""
|
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
474 |
cdef Py_ssize_t orig_len |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
475 |
cdef Py_ssize_t min_new_hash_size |
476 |
assert len(lines) == len(index) |
|
477 |
min_new_hash_size = self._compute_minimum_hash_size(len(self.lines) + |
|
478 |
len(lines)) |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
479 |
orig_len = len(self.lines) |
480 |
self.lines.extend(lines) |
|
481 |
self._extend_raw_lines(index) |
|
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
482 |
if self._hashtable_size >= min_new_hash_size: |
483 |
# Just add the new lines, don't bother resizing the hash table
|
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
484 |
self._extend_hash_table(len(index)) |
|
0.18.23
by John Arbash Meinel
Now we can add more lines without having to rebuild the whole hash |
485 |
return
|
|
0.18.21
by John Arbash Meinel
Add support for not including certain lines in the hashtable. |
486 |
self._build_hash_table() |
|
0.18.13
by John Arbash Meinel
Copy the EquivalenceTable code into pyrex and get it under test. |
487 |
|
488 |
def set_right_lines(self, lines): |
|
489 |
"""Set the lines we will be matching against.""" |
|
490 |
self._right_lines = lines |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
491 |
|
492 |
||
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
493 |
cdef Py_ssize_t intersect_sorted(Py_ssize_t *base, Py_ssize_t base_count, |
494 |
Py_ssize_t *new_values, Py_ssize_t new_count) except -1: |
|
495 |
"""Intersect the base values with the new values. |
|
496 |
||
497 |
It is assumed that both base and new_values are sorted in ascending order,
|
|
498 |
with no duplicates.
|
|
499 |
||
500 |
The values in 'base' will be modified to only contain the entries that are
|
|
501 |
in both base and new. If nothing matches, 'base' will not be modified
|
|
502 |
||
503 |
:param base: An array of base values
|
|
504 |
:param base_count: The length of the base array
|
|
505 |
:param new_values: An array of values to compare to base, will be modified
|
|
506 |
to only contain matching entries
|
|
507 |
:param new_count: The number of new entries
|
|
508 |
:return: The count of matching values
|
|
509 |
"""
|
|
510 |
cdef Py_ssize_t *cur_base |
|
511 |
cdef Py_ssize_t *cur_new |
|
512 |
cdef Py_ssize_t *cur_out |
|
513 |
cdef Py_ssize_t count |
|
514 |
cdef Py_ssize_t *base_end |
|
515 |
cdef Py_ssize_t *new_end |
|
516 |
||
517 |
cur_base = base |
|
518 |
cur_out = base |
|
519 |
cur_new = new_values |
|
520 |
base_end = base + base_count |
|
521 |
new_end = new_values + new_count |
|
522 |
count = 0 |
|
523 |
||
524 |
while cur_base < base_end and cur_new < new_end: |
|
525 |
if cur_base[0] == cur_new[0]: |
|
526 |
# This may be assigning to self.
|
|
527 |
cur_out[0] = cur_base[0] |
|
|
0.18.32
by John Arbash Meinel
Switch away from += for older versions of pyrex, |
528 |
count = count + 1 |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
529 |
cur_out = cur_out + 1 |
530 |
# We matched, so move both pointers
|
|
531 |
cur_base = cur_base + 1 |
|
532 |
cur_new = cur_new + 1 |
|
533 |
elif cur_base[0] < cur_new[0]: |
|
534 |
# cur_base is "behind" so increment it
|
|
535 |
cur_base = cur_base + 1 |
|
536 |
else: |
|
537 |
# cur_new must be "behind" move its pointer
|
|
538 |
cur_new = cur_new + 1 |
|
539 |
return count |
|
540 |
||
541 |
||
542 |
cdef object array_as_list(Py_ssize_t *array, Py_ssize_t count): |
|
543 |
cdef int i |
|
544 |
lst = [] |
|
545 |
for i from 0 <= i < count: |
|
546 |
lst.append(array[i]) |
|
547 |
return lst |
|
548 |
||
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
549 |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
550 |
def _get_longest_match(equivalence_table, pos, max_pos, locations): |
551 |
"""Get the longest possible match for the current position.""" |
|
552 |
cdef EquivalenceTable eq |
|
|
0.18.28
by John Arbash Meinel
Add a function to work in raw C arrays instead of a python list object |
553 |
cdef Py_ssize_t cpos |
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
554 |
cdef Py_ssize_t cmax_pos |
555 |
cdef Py_ssize_t *matches |
|
556 |
cdef Py_ssize_t match_count |
|
557 |
cdef PyObject *line |
|
558 |
cdef Py_ssize_t *copy_ends |
|
559 |
cdef Py_ssize_t copy_count |
|
560 |
cdef Py_ssize_t range_len |
|
561 |
cdef Py_ssize_t in_start |
|
562 |
cdef Py_ssize_t i |
|
563 |
cdef Py_ssize_t intersect_count |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
564 |
|
565 |
eq = equivalence_table |
|
566 |
cpos = pos |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
567 |
cmax_pos = max_pos |
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
568 |
range_start = pos |
569 |
range_len = 0 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
570 |
matches = NULL |
571 |
match_count = 0 |
|
572 |
copy_ends = NULL |
|
573 |
copy_count = 0 |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
574 |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
575 |
# TODO: Handle when locations is not None
|
576 |
while cpos < cmax_pos: |
|
|
0.18.31
by John Arbash Meinel
We had a small bug when we had to rebuild the hash, as we would forget about the non-indexed entries. |
577 |
# TODO: Instead of grabbing the full list of matches and then doing an
|
578 |
# intersection, use a helper that does the intersection
|
|
579 |
# as-it-goes.
|
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
580 |
line = PyList_GET_ITEM(eq._right_lines, cpos) |
581 |
safe_free(<void**>&matches) |
|
582 |
match_count = eq.get_raw_matches(line, &matches) |
|
583 |
if match_count == 0: |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
584 |
# No more matches, just return whatever we have, but we know that
|
585 |
# this last position is not going to match anything
|
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
586 |
cpos = cpos + 1 |
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
587 |
break
|
588 |
else: |
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
589 |
if copy_ends == NULL: |
590 |
# We are starting a new range, just steal the matches
|
|
591 |
copy_ends = matches |
|
592 |
copy_count = match_count |
|
593 |
matches = NULL |
|
594 |
match_count = 0 |
|
595 |
for i from 0 <= i < copy_count: |
|
596 |
copy_ends[i] = copy_ends[i] + 1 |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
597 |
range_len = 1 |
598 |
else: |
|
599 |
# We are currently in the middle of a match
|
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
600 |
intersect_count = intersect_sorted(copy_ends, copy_count, |
601 |
matches, match_count) |
|
602 |
if intersect_count == 0: |
|
|
0.18.26
by John Arbash Meinel
Start with a copy implementation of the _get_longest_match function. |
603 |
# But we are done with this match, we should be
|
604 |
# starting a new one, though. We will pass back 'locations'
|
|
605 |
# so that we don't have to do another lookup.
|
|
606 |
break
|
|
|
0.18.29
by John Arbash Meinel
Implement _get_longest_match in Pyrex. |
607 |
else: |
608 |
copy_count = intersect_count |
|
609 |
# range continues, step the copy ends
|
|
610 |
for i from 0 <= i < copy_count: |
|
611 |
copy_ends[i] = copy_ends[i] + 1 |
|
612 |
range_len = range_len + 1 |
|
613 |
safe_free(<void **>&matches) |
|
614 |
match_count = 0 |
|
615 |
cpos = cpos + 1 |
|
616 |
if matches == NULL: |
|
617 |
# We consumed all of our matches
|
|
618 |
locations = None |
|
619 |
else: |
|
620 |
locations = array_as_list(matches, match_count) |
|
621 |
safe_free(<void **>&matches) |
|
622 |
if copy_ends == NULL or copy_count == 0: |
|
623 |
# We never had a match
|
|
624 |
return None, cpos, locations |
|
625 |
# Because copy_ends is always sorted, we know the 'min' is just the first
|
|
626 |
# node
|
|
627 |
in_start = copy_ends[0] |
|
628 |
safe_free(<void**>©_ends) |
|
629 |
return ((in_start - range_len), range_start, range_len), cpos, locations |