bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
1 |
# Copyright (C) 2008 Canonical Ltd
|
2 |
#
|
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
16 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
17 |
"""Persistent maps from tuple_of_strings->string using CHK stores.
|
18 |
||
19 |
Overview and current status:
|
|
20 |
||
21 |
The CHKMap class implements a dict from tuple_of_strings->string by using a trie
|
|
22 |
with internal nodes of 8-bit fan out; The key tuples are mapped to strings by
|
|
23 |
joining them by \x00, and \x00 padding shorter keys out to the length of the
|
|
24 |
longest key. Leaf nodes are packed as densely as possible, and internal nodes
|
|
25 |
are all and additional 8-bits wide leading to a sparse upper tree.
|
|
26 |
||
27 |
Updates to a CHKMap are done preferentially via the apply_delta method, to
|
|
28 |
allow optimisation of the update operation; but individual map/unmap calls are
|
|
29 |
possible and supported. All changes via map/unmap are buffered in memory until
|
|
30 |
the _save method is called to force serialisation of the tree. apply_delta
|
|
31 |
performs a _save implicitly.
|
|
32 |
||
33 |
TODO:
|
|
34 |
-----
|
|
35 |
||
36 |
Densely packed upper nodes.
|
|
37 |
||
38 |
"""
|
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
39 |
|
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
40 |
import heapq |
|
3735.9.18
by John Arbash Meinel
Make the versionedfile import lazy. |
41 |
from bzrlib import lazy_import |
42 |
lazy_import.lazy_import(globals(), """ |
|
43 |
from bzrlib import versionedfile
|
|
44 |
""") |
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
45 |
|
46 |
||
47 |
class CHKMap(object): |
|
48 |
"""A persistent map from string to string backed by a CHK store.""" |
|
49 |
||
50 |
def __init__(self, store, root_key): |
|
51 |
"""Create a CHKMap object. |
|
52 |
||
53 |
:param store: The store the CHKMap is stored in.
|
|
54 |
:param root_key: The root key of the map. None to create an empty
|
|
55 |
CHKMap.
|
|
56 |
"""
|
|
57 |
self._store = store |
|
58 |
if root_key is None: |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
59 |
self._root_node = LeafNode() |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
60 |
else: |
|
3735.2.41
by Robert Collins
Make the parent_id_basename index be updated during CHKInventory.apply_delta. |
61 |
self._root_node = self._node_key(root_key) |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
62 |
|
63 |
def apply_delta(self, delta): |
|
64 |
"""Apply a delta to the map. |
|
65 |
||
66 |
:param delta: An iterable of old_key, new_key, new_value tuples.
|
|
67 |
If new_key is not None, then new_key->new_value is inserted
|
|
68 |
into the map; if old_key is not None, then the old mapping
|
|
69 |
of old_key is removed.
|
|
70 |
"""
|
|
71 |
for old, new, value in delta: |
|
72 |
if old is not None and old != new: |
|
73 |
# unmap
|
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
74 |
self.unmap(old) |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
75 |
for old, new, value in delta: |
76 |
if new is not None: |
|
77 |
# map
|
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
78 |
self.map(new, value) |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
79 |
return self._save() |
80 |
||
81 |
def _ensure_root(self): |
|
82 |
"""Ensure that the root node is an object not a key.""" |
|
83 |
if type(self._root_node) == tuple: |
|
84 |
# Demand-load the root
|
|
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
85 |
self._root_node = self._get_node(self._root_node) |
86 |
||
87 |
def _get_node(self, node): |
|
88 |
"""Get a node. |
|
89 |
||
90 |
Node that this does not update the _items dict in objects containing a
|
|
91 |
reference to this node. As such it does not prevent subsequent IO being
|
|
92 |
performed.
|
|
93 |
|
|
94 |
:param node: A tuple key or node object.
|
|
95 |
:return: A node object.
|
|
96 |
"""
|
|
97 |
if type(node) == tuple: |
|
98 |
bytes = self._read_bytes(node) |
|
99 |
return _deserialise(bytes, node) |
|
100 |
else: |
|
101 |
return node |
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
102 |
|
103 |
def _read_bytes(self, key): |
|
104 |
stream = self._store.get_record_stream([key], 'unordered', True) |
|
105 |
return stream.next().get_bytes_as('fulltext') |
|
106 |
||
|
3735.9.2
by John Arbash Meinel
Add a _dump_tree helper that assists in debugging what is going on. |
107 |
def _dump_tree(self): |
108 |
"""Return the tree in a string representation.""" |
|
109 |
self._ensure_root() |
|
110 |
res = self._dump_tree_node(self._root_node, prefix='', indent='') |
|
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
111 |
return '\n'.join(res) |
|
3735.9.2
by John Arbash Meinel
Add a _dump_tree helper that assists in debugging what is going on. |
112 |
|
113 |
def _dump_tree_node(self, node, prefix, indent): |
|
114 |
"""For this node and all children, generate a string representation.""" |
|
115 |
result = [] |
|
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
116 |
node_key = node.key() |
117 |
if node_key is not None: |
|
118 |
node_key = node_key[0] |
|
119 |
result.append('%s%r %s %s' % (indent, prefix, node.__class__.__name__, |
|
120 |
node_key)) |
|
|
3735.9.2
by John Arbash Meinel
Add a _dump_tree helper that assists in debugging what is going on. |
121 |
if isinstance(node, InternalNode): |
122 |
# Trigger all child nodes to get loaded
|
|
123 |
list(node._iter_nodes(self._store)) |
|
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
124 |
for prefix, sub in sorted(node._items.iteritems()): |
|
3735.9.2
by John Arbash Meinel
Add a _dump_tree helper that assists in debugging what is going on. |
125 |
result.extend(self._dump_tree_node(sub, prefix, indent + ' ')) |
126 |
else: |
|
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
127 |
for key, value in sorted(node._items.iteritems()): |
128 |
result.append(' %r %r' % (key, value)) |
|
|
3735.9.2
by John Arbash Meinel
Add a _dump_tree helper that assists in debugging what is going on. |
129 |
return result |
130 |
||
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
131 |
@classmethod
|
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
132 |
def from_dict(klass, store, initial_value, maximum_size=0, key_width=1): |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
133 |
"""Create a CHKMap in store with initial_value as the content. |
134 |
|
|
135 |
:param store: The store to record initial_value in, a VersionedFiles
|
|
136 |
object with 1-tuple keys supporting CHK key generation.
|
|
137 |
:param initial_value: A dict to store in store. Its keys and values
|
|
138 |
must be bytestrings.
|
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
139 |
:param maximum_size: The maximum_size rule to apply to nodes. This
|
140 |
determines the size at which no new data is added to a single node.
|
|
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
141 |
:param key_width: The number of elements in each key_tuple being stored
|
142 |
in this map.
|
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
143 |
:return: The root chk of te resulting CHKMap.
|
144 |
"""
|
|
145 |
result = CHKMap(store, None) |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
146 |
result._root_node.set_maximum_size(maximum_size) |
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
147 |
result._root_node._key_width = key_width |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
148 |
delta = [] |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
149 |
for key, value in initial_value.items(): |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
150 |
delta.append((None, key, value)) |
151 |
result.apply_delta(delta) |
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
152 |
return result._save() |
153 |
||
|
3735.2.30
by Robert Collins
Start iter_changes between CHKMap instances. |
154 |
def iter_changes(self, basis): |
155 |
"""Iterate over the changes between basis and self. |
|
156 |
||
157 |
:return: An iterator of tuples: (key, old_value, new_value). Old_value
|
|
158 |
is None for keys only in self; new_value is None for keys only in
|
|
159 |
basis.
|
|
160 |
"""
|
|
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
161 |
# Overview:
|
162 |
# Read both trees in lexographic, highest-first order.
|
|
163 |
# Any identical nodes we skip
|
|
164 |
# Any unique prefixes we output immediately.
|
|
165 |
# values in a leaf node are treated as single-value nodes in the tree
|
|
166 |
# which allows them to be not-special-cased. We know to output them
|
|
167 |
# because their value is a string, not a key(tuple) or node.
|
|
168 |
#
|
|
169 |
# corner cases to beware of when considering this function:
|
|
170 |
# *) common references are at different heights.
|
|
171 |
# consider two trees:
|
|
172 |
# {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
|
|
173 |
# {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'}, 'ab':LeafNode={'ab':'bar'}}
|
|
174 |
# 'b': LeafNode={'b'}}
|
|
175 |
# the node with aaa/aab will only be encountered in the second tree
|
|
176 |
# after reading the 'a' subtree, but it is encountered in the first
|
|
177 |
# tree immediately. Variations on this may have read internal nodes like this.
|
|
178 |
# we want to cut the entire pending subtree when we realise we have a common node.
|
|
179 |
# For this we use a list of keys - the path to a node - and check the entire path is
|
|
180 |
# clean as we process each item.
|
|
181 |
if self._node_key(self._root_node) == self._node_key(basis._root_node): |
|
182 |
return
|
|
183 |
self._ensure_root() |
|
184 |
basis._ensure_root() |
|
185 |
excluded_keys = set() |
|
186 |
self_node = self._root_node |
|
187 |
basis_node = basis._root_node |
|
188 |
# A heap, each element is prefix, node(tuple/NodeObject/string),
|
|
189 |
# key_path (a list of tuples, tail-sharing down the tree.)
|
|
190 |
self_pending = [] |
|
191 |
basis_pending = [] |
|
192 |
def process_node(prefix, node, path, a_map, pending): |
|
193 |
# take a node and expand it
|
|
194 |
node = a_map._get_node(node) |
|
195 |
if type(node) == LeafNode: |
|
196 |
path = (node._key, path) |
|
197 |
for key, value in node._items.items(): |
|
198 |
heapq.heappush(pending, ('\x00'.join(key), value, path)) |
|
199 |
else: |
|
200 |
# type(node) == InternalNode
|
|
201 |
path = (node._key, path) |
|
202 |
for prefix, child in node._items.items(): |
|
203 |
heapq.heappush(pending, (prefix, child, path)) |
|
204 |
process_node(None, self_node, None, self, self_pending) |
|
205 |
process_node(None, basis_node, None, basis, basis_pending) |
|
206 |
self_seen = set() |
|
207 |
basis_seen = set() |
|
208 |
excluded_keys = set() |
|
209 |
def check_excluded(key_path): |
|
210 |
# Note that this is N^2, it depends on us trimming trees
|
|
211 |
# aggressively to not become slow.
|
|
212 |
# A better implementation would probably have a reverse map
|
|
213 |
# back to the children of a node, and jump straight to it when
|
|
214 |
# a common node is detected, the proceed to remove the already
|
|
215 |
# pending children. bzrlib.graph has a searcher module with a
|
|
216 |
# similar problem.
|
|
217 |
while key_path is not None: |
|
218 |
key, key_path = key_path |
|
219 |
if key in excluded_keys: |
|
220 |
return True |
|
221 |
return False |
|
222 |
||
|
3735.2.32
by Robert Collins
Activate test for common node skipping. - 50 times performance improvement. |
223 |
loop_counter = 0 |
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
224 |
while self_pending or basis_pending: |
|
3735.2.32
by Robert Collins
Activate test for common node skipping. - 50 times performance improvement. |
225 |
loop_counter += 1 |
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
226 |
if not self_pending: |
227 |
# self is exhausted: output remainder of basis
|
|
228 |
for prefix, node, path in basis_pending: |
|
229 |
if check_excluded(path): |
|
230 |
continue
|
|
231 |
node = basis._get_node(node) |
|
232 |
if type(node) == str: |
|
233 |
# a value
|
|
234 |
yield (tuple(prefix.split('\x00')), node, None) |
|
235 |
else: |
|
236 |
# subtree - fastpath the entire thing.
|
|
237 |
for key, value in node.iteritems(basis._store): |
|
238 |
yield (key, value, None) |
|
239 |
return
|
|
240 |
elif not basis_pending: |
|
241 |
# basis is exhausted: output remainder of self.
|
|
242 |
for prefix, node, path in self_pending: |
|
243 |
if check_excluded(path): |
|
244 |
continue
|
|
245 |
node = self._get_node(node) |
|
246 |
if type(node) == str: |
|
247 |
# a value
|
|
248 |
yield (tuple(prefix.split('\x00')), None, node) |
|
249 |
else: |
|
250 |
# subtree - fastpath the entire thing.
|
|
251 |
for key, value in node.iteritems(self._store): |
|
252 |
yield (key, None, value) |
|
253 |
return
|
|
254 |
else: |
|
255 |
# XXX: future optimisation - yield the smaller items
|
|
256 |
# immediately rather than pushing everything on/off the
|
|
257 |
# heaps. Applies to both internal nodes and leafnodes.
|
|
258 |
if self_pending[0][0] < basis_pending[0][0]: |
|
259 |
# expand self
|
|
260 |
prefix, node, path = heapq.heappop(self_pending) |
|
261 |
if check_excluded(path): |
|
262 |
continue
|
|
263 |
if type(node) == str: |
|
264 |
# a value
|
|
265 |
yield (tuple(prefix.split('\x00')), None, node) |
|
266 |
else: |
|
267 |
process_node(prefix, node, path, self, self_pending) |
|
268 |
continue
|
|
269 |
elif self_pending[0][0] > basis_pending[0][0]: |
|
270 |
# expand basis
|
|
271 |
prefix, node, path = heapq.heappop(basis_pending) |
|
272 |
if check_excluded(path): |
|
273 |
continue
|
|
274 |
if type(node) == str: |
|
275 |
# a value
|
|
276 |
yield (tuple(prefix.split('\x00')), node, None) |
|
277 |
else: |
|
278 |
process_node(prefix, node, path, basis, basis_pending) |
|
279 |
continue
|
|
280 |
else: |
|
281 |
# common prefix: possibly expand both
|
|
282 |
if type(self_pending[0][1]) != str: |
|
283 |
# process next self
|
|
284 |
read_self = True |
|
285 |
else: |
|
286 |
read_self = False |
|
287 |
if type(basis_pending[0][1]) != str: |
|
288 |
# process next basis
|
|
289 |
read_basis = True |
|
290 |
else: |
|
291 |
read_basis = False |
|
292 |
if not read_self and not read_basis: |
|
293 |
# compare a common value
|
|
294 |
self_details = heapq.heappop(self_pending) |
|
295 |
basis_details = heapq.heappop(basis_pending) |
|
296 |
if self_details[1] != basis_details[1]: |
|
297 |
yield (tuple(self_details[0].split('\x00')), |
|
298 |
basis_details[1], self_details[1]) |
|
299 |
continue
|
|
|
3735.2.32
by Robert Collins
Activate test for common node skipping. - 50 times performance improvement. |
300 |
# At least one side wasn't a string.
|
301 |
if (self._node_key(self_pending[0][1]) == |
|
302 |
self._node_key(basis_pending[0][1])): |
|
303 |
# Identical pointers, skip (and don't bother adding to
|
|
304 |
# excluded, it won't turn up again.
|
|
305 |
heapq.heappop(self_pending) |
|
306 |
heapq.heappop(basis_pending) |
|
307 |
continue
|
|
308 |
# Now we need to expand this node before we can continue
|
|
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
309 |
if read_self: |
310 |
prefix, node, path = heapq.heappop(self_pending) |
|
311 |
if check_excluded(path): |
|
312 |
continue
|
|
313 |
process_node(prefix, node, path, self, self_pending) |
|
314 |
if read_basis: |
|
315 |
prefix, node, path = heapq.heappop(basis_pending) |
|
316 |
if check_excluded(path): |
|
317 |
continue
|
|
318 |
process_node(prefix, node, path, basis, basis_pending) |
|
|
3735.2.32
by Robert Collins
Activate test for common node skipping. - 50 times performance improvement. |
319 |
# print loop_counter
|
|
3735.2.30
by Robert Collins
Start iter_changes between CHKMap instances. |
320 |
|
|
3735.2.9
by Robert Collins
Get a working chk_map using inventory implementation bootstrapped. |
321 |
def iteritems(self, key_filter=None): |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
322 |
"""Iterate over the entire CHKMap's contents.""" |
323 |
self._ensure_root() |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
324 |
return self._root_node.iteritems(self._store, key_filter=key_filter) |
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
325 |
|
|
3735.2.12
by Robert Collins
Implement commit-via-deltas for split inventory repositories. |
326 |
def key(self): |
327 |
"""Return the key for this map.""" |
|
328 |
if isinstance(self._root_node, tuple): |
|
329 |
return self._root_node |
|
330 |
else: |
|
331 |
return self._root_node._key |
|
332 |
||
|
3735.2.17
by Robert Collins
Cache node length to avoid full iteration on __len__ calls. |
333 |
def __len__(self): |
334 |
self._ensure_root() |
|
335 |
return len(self._root_node) |
|
336 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
337 |
def map(self, key, value): |
338 |
"""Map a key tuple to value.""" |
|
339 |
# Need a root object.
|
|
340 |
self._ensure_root() |
|
341 |
prefix, node_details = self._root_node.map(self._store, key, value) |
|
342 |
if len(node_details) == 1: |
|
343 |
self._root_node = node_details[0][1] |
|
344 |
else: |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
345 |
self._root_node = InternalNode(prefix) |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
346 |
self._root_node.set_maximum_size(node_details[0][1].maximum_size) |
347 |
self._root_node._key_width = node_details[0][1]._key_width |
|
348 |
for split, node in node_details: |
|
349 |
self._root_node.add_node(split, node) |
|
350 |
||
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
351 |
def _node_key(self, node): |
352 |
"""Get the key for a node whether its a tuple o r node.""" |
|
353 |
if type(node) == tuple: |
|
354 |
return node |
|
355 |
else: |
|
356 |
return node._key |
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
357 |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
358 |
def unmap(self, key): |
359 |
"""remove key from the map.""" |
|
360 |
self._ensure_root() |
|
361 |
self._root_node.unmap(self._store, key) |
|
362 |
||
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
363 |
def _save(self): |
364 |
"""Save the map completely. |
|
365 |
||
366 |
:return: The key of the root node.
|
|
367 |
"""
|
|
368 |
if type(self._root_node) == tuple: |
|
369 |
# Already saved.
|
|
370 |
return self._root_node |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
371 |
keys = list(self._root_node.serialise(self._store)) |
372 |
return keys[-1] |
|
|
3735.2.8
by Robert Collins
New chk_map module for use in tree based inventory storage. |
373 |
|
374 |
||
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
375 |
class Node(object): |
376 |
"""Base class defining the protocol for CHK Map nodes.""" |
|
377 |
||
378 |
def __init__(self, key_width=1): |
|
379 |
"""Create a node. |
|
380 |
||
381 |
:param key_width: The width of keys for this node.
|
|
382 |
"""
|
|
383 |
self._key = None |
|
384 |
# Current number of elements
|
|
385 |
self._len = 0 |
|
386 |
self._maximum_size = 0 |
|
387 |
self._key_width = 1 |
|
388 |
# current size in bytes
|
|
389 |
self._size = 0 |
|
390 |
# The pointers/values this node has - meaning defined by child classes.
|
|
391 |
self._items = {} |
|
392 |
||
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
393 |
def __repr__(self): |
394 |
items_str = sorted(self._items) |
|
395 |
if len(items_str) > 20: |
|
396 |
items_str = items_str[16] + '...]' |
|
397 |
return '%s(key:%s len:%s size:%s max:%s items:%s)' % ( |
|
398 |
self.__class__.__name__, self._key, self._len, self._size, |
|
399 |
self._maximum_size, items_str) |
|
400 |
||
|
3735.2.38
by Robert Collins
Sufficient fixes to allow bzr-search to index a dev3 format repository. |
401 |
def key(self): |
402 |
return self._key |
|
403 |
||
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
404 |
def __len__(self): |
405 |
return self._len |
|
406 |
||
407 |
@property
|
|
408 |
def maximum_size(self): |
|
409 |
"""What is the upper limit for adding references to a node.""" |
|
410 |
return self._maximum_size |
|
411 |
||
412 |
def set_maximum_size(self, new_size): |
|
413 |
"""Set the size threshold for nodes. |
|
414 |
||
415 |
:param new_size: The size at which no data is added to a node. 0 for
|
|
416 |
unlimited.
|
|
417 |
"""
|
|
418 |
self._maximum_size = new_size |
|
419 |
||
420 |
||
421 |
class LeafNode(Node): |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
422 |
"""A node containing actual key:value pairs. |
423 |
|
|
424 |
:ivar _items: A dict of key->value items. The key is in tuple form.
|
|
425 |
"""
|
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
426 |
|
427 |
def __init__(self): |
|
428 |
Node.__init__(self) |
|
429 |
# The size of a leaf node with default values and no children.
|
|
430 |
self._size = 12 |
|
431 |
||
432 |
def _current_size(self): |
|
433 |
"""Answer the current serialised size of this node.""" |
|
434 |
return (self._size + len(str(self._len)) + len(str(self._key_width)) + |
|
435 |
len(str(self._maximum_size))) |
|
436 |
||
437 |
@classmethod
|
|
438 |
def deserialise(klass, bytes, key): |
|
439 |
"""Deserialise bytes, with key key, into a LeafNode. |
|
440 |
||
441 |
:param bytes: The bytes of the node.
|
|
442 |
:param key: The key that the serialised node has.
|
|
443 |
"""
|
|
444 |
result = LeafNode() |
|
445 |
lines = bytes.splitlines() |
|
446 |
items = {} |
|
447 |
if lines[0] != 'chkleaf:': |
|
448 |
raise ValueError("not a serialised leaf node: %r" % bytes) |
|
449 |
maximum_size = int(lines[1]) |
|
450 |
width = int(lines[2]) |
|
451 |
length = int(lines[3]) |
|
452 |
for line in lines[4:]: |
|
|
3735.2.25
by Robert Collins
CHKInventory core tests passing. |
453 |
elements = line.split('\x00', width) |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
454 |
items[tuple(elements[:-1])] = elements[-1] |
455 |
if len(items) != length: |
|
456 |
raise AssertionError("item count mismatch") |
|
457 |
result._items = items |
|
458 |
result._len = length |
|
459 |
result._maximum_size = maximum_size |
|
460 |
result._key = key |
|
461 |
result._key_width = width |
|
462 |
result._size = len(bytes) |
|
463 |
return result |
|
464 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
465 |
def iteritems(self, store, key_filter=None): |
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
466 |
"""Iterate over items in the node. |
467 |
||
468 |
:param key_filter: A filter to apply to the node. It should be a
|
|
469 |
list/set/dict or similar repeatedly iterable container.
|
|
470 |
"""
|
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
471 |
if key_filter is not None: |
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
472 |
# Adjust the filter - short elements go to a prefix filter. Would this
|
473 |
# be cleaner explicitly? That would be no harder for InternalNode..
|
|
474 |
# XXX: perhaps defaultdict? Profiling<rinse and repeat>
|
|
475 |
filters = {} |
|
476 |
for key in key_filter: |
|
477 |
length_filter = filters.setdefault(len(key), set()) |
|
478 |
length_filter.add(key) |
|
479 |
filters = filters.items() |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
480 |
for item in self._items.iteritems(): |
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
481 |
for length, length_filter in filters: |
482 |
if item[0][:length] in length_filter: |
|
483 |
yield item |
|
484 |
break
|
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
485 |
else: |
486 |
for item in self._items.iteritems(): |
|
487 |
yield item |
|
488 |
||
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
489 |
def _key_value_len(self, key, value): |
490 |
# TODO: Should probably be done without actually joining the key, but
|
|
491 |
# then that can be done via the C extension
|
|
492 |
return 2 + len('\x00'.join(key)) + len(value) |
|
493 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
494 |
def map(self, store, key, value): |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
495 |
"""Map key to value.""" |
496 |
if key in self._items: |
|
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
497 |
self._size -= self._key_value_len(key, self._items[key]) |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
498 |
self._len -= 1 |
499 |
self._items[key] = value |
|
|
3735.9.4
by John Arbash Meinel
Some small cleanups, and fix _dump_tree to handle in-progress nodes. |
500 |
self._size += self._key_value_len(key, value) |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
501 |
self._len += 1 |
502 |
self._key = None |
|
503 |
if (self._maximum_size and self._current_size() > self._maximum_size and |
|
504 |
self._len > 1): |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
505 |
common_prefix = self.unique_serialised_prefix() |
506 |
split_at = len(common_prefix) + 1 |
|
507 |
result = {} |
|
508 |
for key, value in self._items.iteritems(): |
|
509 |
serialised_key = self._serialised_key(key) |
|
510 |
prefix = serialised_key[:split_at] |
|
511 |
if prefix not in result: |
|
512 |
node = LeafNode() |
|
513 |
node.set_maximum_size(self._maximum_size) |
|
514 |
node._key_width = self._key_width |
|
515 |
result[prefix] = node |
|
516 |
else: |
|
517 |
node = result[prefix] |
|
518 |
node.map(store, key, value) |
|
519 |
return common_prefix, result.items() |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
520 |
else: |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
521 |
return self.unique_serialised_prefix(), [("", self)] |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
522 |
|
523 |
def serialise(self, store): |
|
524 |
"""Serialise the tree to store. |
|
525 |
||
526 |
:param store: A VersionedFiles honouring the CHK extensions.
|
|
527 |
:return: An iterable of the keys inserted by this operation.
|
|
528 |
"""
|
|
529 |
lines = ["chkleaf:\n"] |
|
530 |
lines.append("%d\n" % self._maximum_size) |
|
531 |
lines.append("%d\n" % self._key_width) |
|
532 |
lines.append("%d\n" % self._len) |
|
533 |
for key, value in sorted(self._items.items()): |
|
534 |
lines.append("%s\x00%s\n" % ('\x00'.join(key), value)) |
|
535 |
sha1, _, _ = store.add_lines((None,), (), lines) |
|
536 |
self._key = ("sha1:" + sha1,) |
|
537 |
return [self._key] |
|
538 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
539 |
def _serialised_key(self, key): |
540 |
"""Return the serialised key for key in this node.""" |
|
541 |
return '\x00'.join(key) |
|
542 |
||
|
3735.2.26
by Robert Collins
CHKInventory migrated to new CHKMap code. |
543 |
def refs(self): |
544 |
"""Return the references to other CHK's held by this node.""" |
|
545 |
return [] |
|
546 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
547 |
def unique_serialised_prefix(self): |
548 |
"""Return the unique key prefix for this node. |
|
549 |
||
550 |
:return: A bytestring of the longest serialised key prefix that is
|
|
551 |
unique within this node.
|
|
552 |
"""
|
|
553 |
# may want to cache this eventually :- but wait for enough
|
|
554 |
# functionality to profile.
|
|
555 |
keys = list(self._items.keys()) |
|
556 |
if not keys: |
|
557 |
return "" |
|
558 |
current_prefix = self._serialised_key(keys.pop(-1)) |
|
559 |
while current_prefix and keys: |
|
560 |
next_key = self._serialised_key(keys.pop(-1)) |
|
561 |
for pos, (left, right) in enumerate(zip(current_prefix, next_key)): |
|
562 |
if left != right: |
|
563 |
pos -= 1 |
|
564 |
break
|
|
565 |
current_prefix = current_prefix[:pos + 1] |
|
566 |
return current_prefix |
|
567 |
||
568 |
def unmap(self, store, key): |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
569 |
"""Unmap key from the node.""" |
570 |
self._size -= 2 + len('\x00'.join(key)) + len(self._items[key]) |
|
571 |
self._len -= 1 |
|
572 |
del self._items[key] |
|
573 |
self._key = None |
|
574 |
return self |
|
575 |
||
576 |
||
577 |
class InternalNode(Node): |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
578 |
"""A node that contains references to other nodes. |
579 |
|
|
580 |
An InternalNode is responsible for mapping serialised key prefixes to child
|
|
581 |
nodes. It is greedy - it will defer splitting itself as long as possible.
|
|
582 |
"""
|
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
583 |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
584 |
def __init__(self, prefix=''): |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
585 |
Node.__init__(self) |
586 |
# The size of an internalnode with default values and no children.
|
|
587 |
# self._size = 12
|
|
588 |
# How many octets key prefixes within this node are.
|
|
589 |
self._node_width = 0 |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
590 |
self._prefix = prefix |
591 |
||
592 |
def __repr__(self): |
|
593 |
items_str = sorted(self._items) |
|
594 |
if len(items_str) > 20: |
|
595 |
items_str = items_str[16] + '...]' |
|
596 |
return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % ( |
|
597 |
self.__class__.__name__, self._key, self._len, self._size, |
|
598 |
self._maximum_size, self._prefix, items_str) |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
599 |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
600 |
def add_node(self, prefix, node): |
601 |
"""Add a child node with prefix prefix, and node node. |
|
602 |
||
603 |
:param prefix: The serialised key prefix for node.
|
|
604 |
:param node: The node being added.
|
|
605 |
"""
|
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
606 |
assert self._prefix is not None |
607 |
assert prefix.startswith(self._prefix) |
|
608 |
assert len(prefix) == len(self._prefix) + 1 |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
609 |
self._len += len(node) |
610 |
if not len(self._items): |
|
611 |
self._node_width = len(prefix) |
|
|
3735.9.11
by John Arbash Meinel
Handle when an InternalNode decides it needs to split. |
612 |
assert self._node_width == len(self._prefix) + 1 |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
613 |
self._items[prefix] = node |
614 |
self._key = None |
|
615 |
||
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
616 |
def _current_size(self): |
617 |
"""Answer the current serialised size of this node.""" |
|
618 |
return (self._size + len(str(self._len)) + len(str(self._key_width)) + |
|
619 |
len(str(self._maximum_size))) |
|
620 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
621 |
@classmethod
|
622 |
def deserialise(klass, bytes, key): |
|
|
3735.2.25
by Robert Collins
CHKInventory core tests passing. |
623 |
"""Deserialise bytes to an InternalNode, with key key. |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
624 |
|
625 |
:param bytes: The bytes of the node.
|
|
626 |
:param key: The key that the serialised node has.
|
|
627 |
:return: An InternalNode instance.
|
|
628 |
"""
|
|
629 |
result = InternalNode() |
|
630 |
lines = bytes.splitlines() |
|
631 |
items = {} |
|
632 |
if lines[0] != 'chknode:': |
|
633 |
raise ValueError("not a serialised internal node: %r" % bytes) |
|
634 |
maximum_size = int(lines[1]) |
|
635 |
width = int(lines[2]) |
|
636 |
length = int(lines[3]) |
|
637 |
for line in lines[4:]: |
|
638 |
prefix, flat_key = line.rsplit('\x00', 1) |
|
639 |
items[prefix] = (flat_key,) |
|
640 |
result._items = items |
|
641 |
result._len = length |
|
642 |
result._maximum_size = maximum_size |
|
643 |
result._key = key |
|
644 |
result._key_width = width |
|
645 |
result._size = len(bytes) |
|
646 |
result._node_width = len(prefix) |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
647 |
result._prefix = result.unique_serialised_prefix() |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
648 |
return result |
649 |
||
650 |
def iteritems(self, store, key_filter=None): |
|
651 |
for node in self._iter_nodes(store, key_filter=key_filter): |
|
652 |
for item in node.iteritems(store, key_filter=key_filter): |
|
653 |
yield item |
|
654 |
||
655 |
def _iter_nodes(self, store, key_filter=None): |
|
656 |
"""Iterate over node objects which match key_filter. |
|
657 |
||
658 |
:param store: A store to use for accessing content.
|
|
659 |
:param key_filter: A key filter to filter nodes. Only nodes that might
|
|
660 |
contain a key in key_filter will be returned.
|
|
661 |
:return: An iterable of nodes.
|
|
662 |
"""
|
|
663 |
nodes = [] |
|
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
664 |
keys = {} |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
665 |
if key_filter is None: |
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
666 |
for prefix, node in self._items.iteritems(): |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
667 |
if type(node) == tuple: |
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
668 |
keys[node] = prefix |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
669 |
else: |
670 |
nodes.append(node) |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
671 |
else: |
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
672 |
# XXX defaultdict ?
|
673 |
length_filters = {} |
|
674 |
for key in key_filter: |
|
675 |
serialised_key = self._serialised_prefix_filter(key) |
|
676 |
length_filter = length_filters.setdefault(len(serialised_key), |
|
677 |
set()) |
|
678 |
length_filter.add(serialised_key) |
|
679 |
length_filters = length_filters.items() |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
680 |
for prefix, node in self._items.iteritems(): |
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
681 |
for length, length_filter in length_filters: |
682 |
if prefix[:length] in length_filter: |
|
683 |
if type(node) == tuple: |
|
684 |
keys[node] = prefix |
|
685 |
else: |
|
686 |
nodes.append(node) |
|
687 |
break
|
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
688 |
if keys: |
689 |
# demand load some pages.
|
|
690 |
stream = store.get_record_stream(keys, 'unordered', True) |
|
691 |
for record in stream: |
|
692 |
node = _deserialise(record.get_bytes_as('fulltext'), record.key) |
|
693 |
nodes.append(node) |
|
|
3735.2.31
by Robert Collins
CHKMap.iter_changes |
694 |
self._items[keys[record.key]] = node |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
695 |
return nodes |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
696 |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
697 |
def map(self, store, key, value): |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
698 |
"""Map key to value.""" |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
699 |
if not len(self._items): |
700 |
raise AssertionError("cant map in an empty InternalNode.") |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
701 |
serialised_key = self._serialised_key(key) |
|
3735.9.11
by John Arbash Meinel
Handle when an InternalNode decides it needs to split. |
702 |
assert self._node_width == len(self._prefix) + 1 |
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
703 |
if not serialised_key.startswith(self._prefix): |
|
3735.9.11
by John Arbash Meinel
Handle when an InternalNode decides it needs to split. |
704 |
# This key doesn't fit in this index, so we need to split at the
|
705 |
# point where it would fit.
|
|
706 |
# XXX: Do we need the serialised_key in its maximum length?
|
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
707 |
new_prefix = self.unique_serialised_prefix(serialised_key) |
708 |
new_parent = InternalNode(new_prefix) |
|
709 |
new_parent.set_maximum_size(self._maximum_size) |
|
710 |
new_parent._key_width = self._key_width |
|
711 |
new_parent.add_node(self._prefix[:len(new_prefix)+1], self) |
|
|
3735.9.11
by John Arbash Meinel
Handle when an InternalNode decides it needs to split. |
712 |
assert new_parent._node_width == len(new_parent._prefix) + 1 |
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
713 |
return new_parent.map(store, key, value) |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
714 |
children = self._iter_nodes(store, key_filter=[key]) |
715 |
if children: |
|
716 |
child = children[0] |
|
|
3735.9.11
by John Arbash Meinel
Handle when an InternalNode decides it needs to split. |
717 |
# if isinstance(child, InternalNode):
|
718 |
# child_prefix = child._prefix
|
|
719 |
# child_ser_key = child._serialised_key(key)
|
|
720 |
# if not child_ser_key.startswith(child_prefix):
|
|
721 |
# import pdb; pdb.set_trace()
|
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
722 |
else: |
723 |
# new child needed:
|
|
724 |
child = self._new_child(serialised_key, LeafNode) |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
725 |
old_len = len(child) |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
726 |
prefix, node_details = child.map(store, key, value) |
727 |
if len(node_details) == 1: |
|
|
3735.9.11
by John Arbash Meinel
Handle when an InternalNode decides it needs to split. |
728 |
# child may have shrunk, or might be a new node
|
729 |
child = node_details[0][1] |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
730 |
self._len = self._len - old_len + len(child) |
731 |
self._items[serialised_key] = child |
|
|
3735.2.29
by Robert Collins
Untested code is broken code. |
732 |
self._key = None |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
733 |
return self.unique_serialised_prefix(), [("", self)] |
734 |
# child has overflown - create a new intermediate node.
|
|
735 |
# XXX: This is where we might want to try and expand our depth
|
|
736 |
# to refer to more bytes of every child (which would give us
|
|
737 |
# multiple pointers to child nodes, but less intermediate nodes)
|
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
738 |
# TODO: Is this mapped as serialised_key or as prefix?
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
739 |
child = self._new_child(serialised_key, InternalNode) |
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
740 |
child._prefix = prefix |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
741 |
for split, node in node_details: |
742 |
child.add_node(split, node) |
|
743 |
self._len = self._len - old_len + len(child) |
|
|
3735.2.29
by Robert Collins
Untested code is broken code. |
744 |
self._key = None |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
745 |
return self.unique_serialised_prefix(), [("", self)] |
746 |
||
747 |
def _new_child(self, serialised_key, klass): |
|
748 |
"""Create a new child node of type klass.""" |
|
749 |
child = klass() |
|
750 |
child.set_maximum_size(self._maximum_size) |
|
751 |
child._key_width = self._key_width |
|
752 |
self._items[serialised_key] = child |
|
753 |
return child |
|
754 |
||
755 |
def serialise(self, store): |
|
756 |
"""Serialise the node to store. |
|
757 |
||
758 |
:param store: A VersionedFiles honouring the CHK extensions.
|
|
759 |
:return: An iterable of the keys inserted by this operation.
|
|
760 |
"""
|
|
761 |
for node in self._items.itervalues(): |
|
762 |
if type(node) == tuple: |
|
763 |
# Never deserialised.
|
|
764 |
continue
|
|
765 |
if node._key is not None: |
|
766 |
# Never altered
|
|
767 |
continue
|
|
768 |
for key in node.serialise(store): |
|
769 |
yield key |
|
770 |
lines = ["chknode:\n"] |
|
771 |
lines.append("%d\n" % self._maximum_size) |
|
772 |
lines.append("%d\n" % self._key_width) |
|
773 |
lines.append("%d\n" % self._len) |
|
774 |
for prefix, node in sorted(self._items.items()): |
|
775 |
if type(node) == tuple: |
|
776 |
key = node[0] |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
777 |
else: |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
778 |
key = node._key[0] |
779 |
lines.append("%s\x00%s\n" % (prefix, key)) |
|
780 |
sha1, _, _ = store.add_lines((None,), (), lines) |
|
781 |
self._key = ("sha1:" + sha1,) |
|
782 |
yield self._key |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
783 |
|
784 |
def _serialised_key(self, key): |
|
785 |
"""Return the serialised key for key in this node.""" |
|
786 |
return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width] |
|
787 |
||
|
3735.2.43
by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces. |
788 |
def _serialised_prefix_filter(self, key): |
789 |
"""Serialise key for use as a prefix filter in iteritems.""" |
|
790 |
if len(key) == self._key_width: |
|
791 |
return self._serialised_key(key) |
|
792 |
return '\x00'.join(key)[:self._node_width] |
|
793 |
||
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
794 |
def _split(self, offset): |
795 |
"""Split this node into smaller nodes starting at offset. |
|
796 |
||
797 |
:param offset: The offset to start the new child nodes at.
|
|
798 |
:return: An iterable of (prefix, node) tuples. prefix is a byte
|
|
799 |
prefix for reaching node.
|
|
800 |
"""
|
|
801 |
if offset >= self._node_width: |
|
802 |
for node in self._items.values(): |
|
803 |
for result in node._split(offset): |
|
804 |
yield result |
|
805 |
return
|
|
806 |
for key, node in self._items.items(): |
|
807 |
pass
|
|
808 |
||
|
3735.2.26
by Robert Collins
CHKInventory migrated to new CHKMap code. |
809 |
def refs(self): |
810 |
"""Return the references to other CHK's held by this node.""" |
|
811 |
if self._key is None: |
|
812 |
raise AssertionError("unserialised nodes have no refs.") |
|
813 |
refs = [] |
|
814 |
for value in self._items.itervalues(): |
|
815 |
if type(value) == tuple: |
|
816 |
refs.append(value) |
|
817 |
else: |
|
818 |
refs.append(value.key()) |
|
819 |
return refs |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
820 |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
821 |
def unique_serialised_prefix(self, extra_key=None): |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
822 |
"""Return the unique key prefix for this node. |
823 |
||
824 |
:return: A bytestring of the longest serialised key prefix that is
|
|
825 |
unique within this node.
|
|
826 |
"""
|
|
827 |
# may want to cache this eventually :- but wait for enough
|
|
828 |
# functionality to profile.
|
|
829 |
keys = list(self._items.keys()) |
|
|
3735.9.5
by John Arbash Meinel
Don't allow an InternalNode to add a key that doesn't fit. |
830 |
if extra_key is not None: |
831 |
keys.append(extra_key) |
|
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
832 |
if not keys: |
833 |
return "" |
|
834 |
current_prefix = keys.pop(-1) |
|
835 |
while current_prefix and keys: |
|
836 |
next_key = keys.pop(-1) |
|
837 |
for pos, (left, right) in enumerate(zip(current_prefix, next_key)): |
|
838 |
if left != right: |
|
839 |
pos -= 1 |
|
840 |
break
|
|
841 |
current_prefix = current_prefix[:pos + 1] |
|
842 |
return current_prefix |
|
843 |
||
844 |
def unmap(self, store, key): |
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
845 |
"""Remove key from this node and it's children.""" |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
846 |
if not len(self._items): |
847 |
raise AssertionError("cant unmap in an empty InternalNode.") |
|
848 |
serialised_key = self._serialised_key(key) |
|
849 |
children = self._iter_nodes(store, key_filter=[key]) |
|
850 |
serialised_key = self._serialised_key(key) |
|
851 |
if children: |
|
852 |
child = children[0] |
|
853 |
else: |
|
854 |
raise KeyError(key) |
|
855 |
self._len -= 1 |
|
856 |
unmapped = child.unmap(store, key) |
|
857 |
if len(unmapped) == 0: |
|
858 |
# All child nodes are gone, remove the child:
|
|
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
859 |
del self._items[serialised_key] |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
860 |
else: |
861 |
# Stash the returned node
|
|
862 |
self._items[serialised_key] = unmapped |
|
|
3735.2.23
by Robert Collins
Test unmapping with one child left but multiple keys. |
863 |
if len(self._items) == 1: |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
864 |
# this node is no longer needed:
|
865 |
return self._items.values()[0] |
|
|
3735.2.29
by Robert Collins
Untested code is broken code. |
866 |
self._key = None |
|
3735.2.18
by Robert Collins
Partial multi-layer chk dictionary trees. |
867 |
return self |
868 |
||
869 |
||
|
3735.2.16
by Robert Collins
Untested extensions to support repodetails |
870 |
def _deserialise(bytes, key): |
871 |
"""Helper for repositorydetails - convert bytes to a node.""" |
|
|
3735.2.24
by Robert Collins
test_chk_map tests all passing. |
872 |
if bytes.startswith("chkleaf:\n"): |
|
3735.2.21
by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux. |
873 |
return LeafNode.deserialise(bytes, key) |
874 |
elif bytes.startswith("chknode:\n"): |
|
875 |
return InternalNode.deserialise(bytes, key) |
|
|
3735.2.16
by Robert Collins
Untested extensions to support repodetails |
876 |
else: |
877 |
raise AssertionError("Unknown node type.") |
|
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
878 |
|
879 |
||
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
880 |
def _find_children_info(store, interesting_keys, uninteresting_keys, |
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
881 |
adapter, pb): |
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
882 |
"""Read the associated records, and determine what is interesting.""" |
883 |
uninteresting_keys = set(uninteresting_keys) |
|
884 |
chks_to_read = uninteresting_keys.union(interesting_keys) |
|
885 |
next_uninteresting = set() |
|
886 |
next_interesting = set() |
|
887 |
uninteresting_items = set() |
|
888 |
interesting_items = set() |
|
889 |
interesting_records = [] |
|
|
3735.9.14
by John Arbash Meinel
Start using the iter_interesting_nodes. |
890 |
# records_read = set()
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
891 |
for record in store.get_record_stream(chks_to_read, 'unordered', True): |
|
3735.9.14
by John Arbash Meinel
Start using the iter_interesting_nodes. |
892 |
# records_read.add(record.key())
|
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
893 |
if pb is not None: |
894 |
pb.tick() |
|
895 |
if record.storage_kind != 'fulltext': |
|
896 |
bytes = adapter.get_bytes(record, |
|
897 |
record.get_bytes_as(record.storage_kind)) |
|
898 |
else: |
|
899 |
bytes = record.get_bytes_as('fulltext') |
|
900 |
node = _deserialise(bytes, record.key) |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
901 |
if record.key in uninteresting_keys: |
902 |
if isinstance(node, InternalNode): |
|
|
3735.9.14
by John Arbash Meinel
Start using the iter_interesting_nodes. |
903 |
next_uninteresting.update(node.refs()) |
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
904 |
else: |
|
3735.9.14
by John Arbash Meinel
Start using the iter_interesting_nodes. |
905 |
# We know we are at a LeafNode, so we can pass None for the
|
906 |
# store
|
|
907 |
uninteresting_items.update(node.iteritems(None)) |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
908 |
else: |
909 |
interesting_records.append(record) |
|
910 |
if isinstance(node, InternalNode): |
|
|
3735.9.14
by John Arbash Meinel
Start using the iter_interesting_nodes. |
911 |
next_interesting.update(node.refs()) |
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
912 |
else: |
|
3735.9.14
by John Arbash Meinel
Start using the iter_interesting_nodes. |
913 |
interesting_items.update(node.iteritems(None)) |
914 |
# TODO: Filter out records that have already been read, as node splitting
|
|
915 |
# can cause us to reference the same nodes via shorter and longer
|
|
916 |
# paths
|
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
917 |
return (next_uninteresting, uninteresting_items, |
918 |
next_interesting, interesting_records, interesting_items) |
|
919 |
||
920 |
||
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
921 |
def _find_all_uninteresting(store, interesting_root_keys, |
922 |
uninteresting_root_keys, adapter, pb): |
|
923 |
"""Determine the full set of uninteresting keys.""" |
|
924 |
# What about duplicates between interesting_root_keys and
|
|
925 |
# uninteresting_root_keys?
|
|
926 |
if not uninteresting_root_keys: |
|
927 |
# Shortcut case. We know there is nothing uninteresting to filter out
|
|
928 |
# So we just let the rest of the algorithm do the work
|
|
929 |
# We know there is nothing uninteresting, and we didn't have to read
|
|
930 |
# any interesting records yet.
|
|
931 |
return (set(), set(), set(interesting_root_keys), [], set()) |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
932 |
all_uninteresting_chks = set(uninteresting_root_keys) |
933 |
all_uninteresting_items = set() |
|
934 |
||
935 |
# First step, find the direct children of both the interesting and
|
|
936 |
# uninteresting set
|
|
937 |
(uninteresting_keys, uninteresting_items, |
|
938 |
interesting_keys, interesting_records, |
|
939 |
interesting_items) = _find_children_info(store, interesting_root_keys, |
|
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
940 |
uninteresting_root_keys, |
941 |
adapter=adapter, pb=pb) |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
942 |
all_uninteresting_chks.update(uninteresting_keys) |
943 |
all_uninteresting_items.update(uninteresting_items) |
|
944 |
del uninteresting_items |
|
945 |
# Note: Exact matches between interesting and uninteresting do not need
|
|
946 |
# to be search further. Non-exact matches need to be searched in case
|
|
947 |
# there is a future exact-match
|
|
948 |
uninteresting_keys.difference_update(interesting_keys) |
|
949 |
||
|
3735.9.6
by John Arbash Meinel
Add a first pass to the interesting search. |
950 |
# Second, find the full set of uninteresting bits reachable by the
|
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
951 |
# uninteresting roots
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
952 |
chks_to_read = uninteresting_keys |
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
953 |
while chks_to_read: |
954 |
next_chks = set() |
|
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
955 |
for record in store.get_record_stream(chks_to_read, 'unordered', False): |
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
956 |
# TODO: Handle 'absent'
|
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
957 |
if pb is not None: |
958 |
pb.tick() |
|
959 |
if record.storage_kind != 'fulltext': |
|
960 |
bytes = adapter.get_bytes(record, |
|
961 |
record.get_bytes_as(record.storage_kind)) |
|
962 |
else: |
|
963 |
bytes = record.get_bytes_as('fulltext') |
|
964 |
node = _deserialise(bytes, record.key) |
|
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
965 |
if isinstance(node, InternalNode): |
966 |
# uninteresting_prefix_chks.update(node._items.iteritems())
|
|
967 |
chks = node._items.values() |
|
968 |
# TODO: We remove the entries that are already in
|
|
969 |
# uninteresting_chks ?
|
|
970 |
next_chks.update(chks) |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
971 |
all_uninteresting_chks.update(chks) |
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
972 |
else: |
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
973 |
all_uninteresting_items.update(node._items.iteritems()) |
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
974 |
chks_to_read = next_chks |
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
975 |
return (all_uninteresting_chks, all_uninteresting_items, |
976 |
interesting_keys, interesting_records, interesting_items) |
|
977 |
||
978 |
||
979 |
def iter_interesting_nodes(store, interesting_root_keys, |
|
980 |
uninteresting_root_keys, pb=None): |
|
981 |
"""Given root keys, find interesting nodes. |
|
982 |
||
983 |
Evaluate nodes referenced by interesting_root_keys. Ones that are also
|
|
984 |
referenced from uninteresting_root_keys are not considered interesting.
|
|
985 |
||
986 |
:param interesting_root_keys: keys which should be part of the
|
|
987 |
"interesting" nodes (which will be yielded)
|
|
988 |
:param uninteresting_root_keys: keys which should be filtered out of the
|
|
989 |
result set.
|
|
990 |
:return: Yield
|
|
991 |
(interesting records, interesting chk's, interesting key:values)
|
|
992 |
"""
|
|
993 |
# TODO: consider that it may be more memory efficient to use the 20-byte
|
|
994 |
# sha1 string, rather than tuples of hexidecimal sha1 strings.
|
|
995 |
||
996 |
# A way to adapt from the compressed texts back into fulltexts
|
|
997 |
# In a way, this seems like a layering inversion to have CHKMap know the
|
|
998 |
# details of versionedfile
|
|
999 |
adapter_class = versionedfile.adapter_registry.get( |
|
1000 |
('knit-ft-gz', 'fulltext')) |
|
1001 |
adapter = adapter_class(store) |
|
1002 |
||
1003 |
(all_uninteresting_chks, all_uninteresting_items, interesting_keys, |
|
1004 |
interesting_records, interesting_items) = _find_all_uninteresting(store, |
|
1005 |
interesting_root_keys, uninteresting_root_keys, adapter, pb) |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
1006 |
|
1007 |
# Now that we know everything uninteresting, we can yield information from
|
|
1008 |
# our first request
|
|
1009 |
interesting_items.difference_update(all_uninteresting_items) |
|
1010 |
records = dict((record.key, record) for record in interesting_records |
|
1011 |
if record.key not in all_uninteresting_chks) |
|
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
1012 |
if records or interesting_items: |
1013 |
yield records, interesting_items |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
1014 |
interesting_keys.difference_update(all_uninteresting_chks) |
1015 |
||
1016 |
chks_to_read = interesting_keys |
|
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
1017 |
while chks_to_read: |
1018 |
next_chks = set() |
|
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
1019 |
for record in store.get_record_stream(chks_to_read, 'unordered', False): |
1020 |
if pb is not None: |
|
1021 |
pb.tick() |
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
1022 |
# TODO: Handle 'absent'?
|
|
3735.9.17
by John Arbash Meinel
Pass around a progress bar and switch to using an adapter. |
1023 |
if record.storage_kind != 'fulltext': |
1024 |
bytes = adapter.get_bytes(record, |
|
1025 |
record.get_bytes_as(record.storage_kind)) |
|
1026 |
else: |
|
1027 |
bytes = record.get_bytes_as('fulltext') |
|
1028 |
node = _deserialise(bytes, record.key) |
|
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
1029 |
if isinstance(node, InternalNode): |
|
3735.9.15
by John Arbash Meinel
Found a bug in iter_interesting_nodes and its test suite. |
1030 |
chks = set(node.refs()) |
1031 |
chks.difference_update(all_uninteresting_chks) |
|
1032 |
# Is set() and .difference_update better than:
|
|
1033 |
# chks = [chk for chk in node.refs()
|
|
1034 |
# if chk not in all_uninteresting_chks]
|
|
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
1035 |
next_chks.update(chks) |
1036 |
# These are now uninteresting everywhere else
|
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
1037 |
all_uninteresting_chks.update(chks) |
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
1038 |
interesting_items = [] |
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
1039 |
else: |
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
1040 |
interesting_items = [item for item in node._items.iteritems() |
1041 |
if item not in all_uninteresting_items] |
|
|
3735.9.15
by John Arbash Meinel
Found a bug in iter_interesting_nodes and its test suite. |
1042 |
# TODO: Do we need to filter out items that we have already
|
1043 |
# seen on other pages? We don't really want to buffer the
|
|
1044 |
# whole thing, but it does mean that callers need to
|
|
1045 |
# understand they may get duplicate values.
|
|
|
3735.9.7
by John Arbash Meinel
Cleanup pass. |
1046 |
# all_uninteresting_items.update(interesting_items)
|
|
3735.9.19
by John Arbash Meinel
Refactor iter_interesting a little bit. |
1047 |
yield {record.key: record}, interesting_items |
|
3735.9.1
by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit |
1048 |
chks_to_read = next_chks |