# bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Persistent maps from tuple_of_strings->string using CHK stores.

Overview and current status:

The CHKMap class implements a dict from tuple_of_strings->string by using a trie
with internal nodes of 8-bit fan out; the key tuples are mapped to strings by
joining them by \x00, and \x00 padding shorter keys out to the length of the
longest key. Leaf nodes are packed as densely as possible, and internal nodes
are all an additional 8-bits wide leading to a sparse upper tree.

Updates to a CHKMap are done preferentially via the apply_delta method, to
allow optimisation of the update operation; but individual map/unmap calls are
possible and supported. All changes via map/unmap are buffered in memory until
the _save method is called to force serialisation of the tree. apply_delta
performs a _save implicitly.

TODO:
-----

Densely packed upper nodes.

"""

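# A minimal usage sketch (hypothetical values; assumes store is a CHK-capable
# VersionedFiles object, as described in CHKMap.from_dict below):
#
#     root_key = CHKMap.from_dict(store, {('name',): 'data'}, maximum_size=255)
#     chkmap = CHKMap(store, root_key)
#     for key, value in chkmap.iteritems():
#         pass  # yields (('name',), 'data')
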
import heapq
import time

from bzrlib import lazy_import
lazy_import.lazy_import(globals(), """
from bzrlib import versionedfile
""")
from bzrlib import (
    errors,
    lru_cache,
    osutils,
    registry,
    trace,
    )

# approx 4MB
# If each line is 50 bytes, and you have 255 internal pages, with 255-way fan
# out, it takes 3.1MB to cache the layer.
_PAGE_CACHE_SIZE = 4*1024*1024
# We are caching bytes so len(value) is perfectly accurate
_page_cache = lru_cache.LRUSizeCache(_PAGE_CACHE_SIZE)

# If a ChildNode falls below this many bytes, we check for a remap
_INTERESTING_NEW_SIZE = 50
# If a ChildNode shrinks by more than this amount, we check for a remap
_INTERESTING_SHRINKAGE_LIMIT = 20
# If we delete more than this many nodes applying a delta, we check for a remap
_INTERESTING_DELETES_LIMIT = 5


def _search_key_plain(key):
    """Map the key tuple into a search string that just uses the key bytes."""
    return '\x00'.join(key)
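# For example, _search_key_plain(('foo', 'bar')) == 'foo\x00bar'.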


search_key_registry = registry.Registry()
search_key_registry.register('plain', _search_key_plain)


class CHKMap(object):
    """A persistent map from string to string backed by a CHK store."""

    def __init__(self, store, root_key, search_key_func=None):
        """Create a CHKMap object.

        :param store: The store the CHKMap is stored in.
        :param root_key: The root key of the map. None to create an empty
            CHKMap.
        :param search_key_func: A function mapping a key => bytes. These bytes
            are then used by the internal nodes to split up leaf nodes into
            multiple pages.
        """
        self._store = store
        if search_key_func is None:
            search_key_func = _search_key_plain
        self._search_key_func = search_key_func
        if root_key is None:
            self._root_node = LeafNode(search_key_func=search_key_func)
        else:
            self._root_node = self._node_key(root_key)

    def apply_delta(self, delta):
        """Apply a delta to the map.

        :param delta: An iterable of old_key, new_key, new_value tuples.
            If new_key is not None, then new_key->new_value is inserted
            into the map; if old_key is not None, then the old mapping
            of old_key is removed.
        """
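        # For example, applying
        #     [(('old',), None, None), (None, ('new',), 'value')]
        # removes ('old',) and inserts ('new',)->'value' in one batch.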
        delete_count = 0
        for old, new, value in delta:
            if old is not None and old != new:
                self.unmap(old, check_remap=False)
                delete_count += 1
        for old, new, value in delta:
            if new is not None:
                self.map(new, value)
        if delete_count > _INTERESTING_DELETES_LIMIT:
            trace.mutter("checking remap as %d deletions", delete_count)
            self._check_remap()
        return self._save()

    def _ensure_root(self):
        """Ensure that the root node is an object not a key."""
        if type(self._root_node) is tuple:
            # Demand-load the root
            self._root_node = self._get_node(self._root_node)

    def _get_node(self, node):
        """Get a node.

        Note that this does not update the _items dict in objects containing a
        reference to this node. As such it does not prevent subsequent IO being
        performed.

        :param node: A tuple key or node object.
        :return: A node object.
        """
        if type(node) is tuple:
            bytes = self._read_bytes(node)
            return _deserialise(bytes, node,
                search_key_func=self._search_key_func)
        else:
            return node

    def _read_bytes(self, key):
        try:
            return _page_cache[key]
        except KeyError:
            stream = self._store.get_record_stream([key], 'unordered', True)
            bytes = stream.next().get_bytes_as('fulltext')
            _page_cache[key] = bytes
            return bytes

    def _dump_tree(self, include_keys=False):
        """Return the tree in a string representation."""
        self._ensure_root()
        res = self._dump_tree_node(self._root_node, prefix='', indent='',
                                   include_keys=include_keys)
        res.append('') # Give a trailing '\n'
        return '\n'.join(res)

    def _dump_tree_node(self, node, prefix, indent, include_keys=True):
        """For this node and all children, generate a string representation."""
        result = []
        if not include_keys:
            key_str = ''
        else:
            node_key = node.key()
            if node_key is not None:
                key_str = ' %s' % (node_key[0],)
            else:
                key_str = ' None'
        result.append('%s%r %s%s' % (indent, prefix, node.__class__.__name__,
                                     key_str))
        if type(node) is InternalNode:
            # Trigger all child nodes to get loaded
            list(node._iter_nodes(self._store))
            for prefix, sub in sorted(node._items.iteritems()):
                result.extend(self._dump_tree_node(sub, prefix, indent + '  ',
                                                   include_keys=include_keys))
        else:
            for key, value in sorted(node._items.iteritems()):
                # Don't use prefix nor indent here to line up when used in
                # tests in conjunction with assertEqualDiff
                result.append('      %r %r' % (key, value))
        return result

    @classmethod
    def from_dict(klass, store, initial_value, maximum_size=0, key_width=1,
                  search_key_func=None):
        """Create a CHKMap in store with initial_value as the content.

        :param store: The store to record initial_value in, a VersionedFiles
            object with 1-tuple keys supporting CHK key generation.
        :param initial_value: A dict to store in store. Its keys and values
            must be bytestrings.
        :param maximum_size: The maximum_size rule to apply to nodes. This
            determines the size at which no new data is added to a single node.
        :param key_width: The number of elements in each key_tuple being stored
            in this map.
        :param search_key_func: A function mapping a key => bytes. These bytes
            are then used by the internal nodes to split up leaf nodes into
            multiple pages.
        :return: The root chk of the resulting CHKMap.
        """
        root_key = klass._create_directly(store, initial_value,
            maximum_size=maximum_size, key_width=key_width,
            search_key_func=search_key_func)
        return root_key

    @classmethod
    def _create_via_map(klass, store, initial_value, maximum_size=0,
                        key_width=1, search_key_func=None):
        result = klass(store, None, search_key_func=search_key_func)
        result._root_node.set_maximum_size(maximum_size)
        result._root_node._key_width = key_width
        delta = []
        for key, value in initial_value.items():
            delta.append((None, key, value))
        root_key = result.apply_delta(delta)
        return root_key

    @classmethod
    def _create_directly(klass, store, initial_value, maximum_size=0,
                         key_width=1, search_key_func=None):
        node = LeafNode(search_key_func=search_key_func)
        node.set_maximum_size(maximum_size)
        node._key_width = key_width
        node._items = dict(initial_value)
        node._raw_size = sum([node._key_value_len(key, value)
                              for key, value in initial_value.iteritems()])
        node._len = len(node._items)
        node._compute_search_prefix()
        node._compute_serialised_prefix()
        if (node._len > 1
            and maximum_size
            and node._current_size() > maximum_size):
            prefix, node_details = node._split(store)
            if len(node_details) == 1:
                raise AssertionError('Failed to split using node._split')
            node = InternalNode(prefix, search_key_func=search_key_func)
            node.set_maximum_size(maximum_size)
            node._key_width = key_width
            for split, subnode in node_details:
                node.add_node(split, subnode)
        keys = list(node.serialise(store))
        return keys[-1]

    def iter_changes(self, basis):
        """Iterate over the changes between basis and self.

        :return: An iterator of tuples: (key, old_value, new_value). Old_value
            is None for keys only in self; new_value is None for keys only in
            basis.
        """
        # Overview:
        # Read both trees in lexicographic, highest-first order.
        # Any identical nodes we skip
        # Any unique prefixes we output immediately.
        # values in a leaf node are treated as single-value nodes in the tree
        # which allows them to be not-special-cased. We know to output them
        # because their value is a string, not a key(tuple) or node.
        #
        # corner cases to beware of when considering this function:
        # *) common references are at different heights.
        #    consider two trees:
        #    {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
        #    {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'},
        #                        'ab':LeafNode={'ab':'bar'}}
        #     'b': LeafNode={'b'}}
        #    the node with aaa/aab will only be encountered in the second tree
        #    after reading the 'a' subtree, but it is encountered in the first
        #    tree immediately. Variations on this may have read internal nodes
        #    like this.  we want to cut the entire pending subtree when we
        #    realise we have a common node.  For this we use a list of keys -
        #    the path to a node - and check the entire path is clean as we
        #    process each item.
        if self._node_key(self._root_node) == self._node_key(basis._root_node):
            return
        self._ensure_root()
        basis._ensure_root()
        excluded_keys = set()
        self_node = self._root_node
        basis_node = basis._root_node
        # A heap, each element is prefix, node(tuple/NodeObject/string),
        # key_path (a list of tuples, tail-sharing down the tree.)
        self_pending = []
        basis_pending = []
        def process_node(node, path, a_map, pending):
            # take a node and expand it
            node = a_map._get_node(node)
            if type(node) == LeafNode:
                path = (node._key, path)
                for key, value in node._items.items():
                    # For a LeafNode, the key is a serialized_key, rather than
                    # a search_key, but the heap is using search_keys
                    search_key = node._search_key_func(key)
                    heapq.heappush(pending, (search_key, key, value, path))
            else:
                # type(node) == InternalNode
                path = (node._key, path)
                for prefix, child in node._items.items():
                    heapq.heappush(pending, (prefix, None, child, path))
        def process_common_internal_nodes(self_node, basis_node):
            self_items = set(self_node._items.items())
            basis_items = set(basis_node._items.items())
            path = (self_node._key, None)
            for prefix, child in self_items - basis_items:
                heapq.heappush(self_pending, (prefix, None, child, path))
            path = (basis_node._key, None)
            for prefix, child in basis_items - self_items:
                heapq.heappush(basis_pending, (prefix, None, child, path))
        def process_common_leaf_nodes(self_node, basis_node):
            self_items = set(self_node._items.items())
            basis_items = set(basis_node._items.items())
            path = (self_node._key, None)
            for key, value in self_items - basis_items:
                prefix = self._search_key_func(key)
                heapq.heappush(self_pending, (prefix, key, value, path))
            path = (basis_node._key, None)
            for key, value in basis_items - self_items:
                prefix = basis._search_key_func(key)
                heapq.heappush(basis_pending, (prefix, key, value, path))
        def process_common_prefix_nodes(self_node, self_path,
                                        basis_node, basis_path):
            # Would it be more efficient if we could request both at the same
            # time?
            self_node = self._get_node(self_node)
            basis_node = basis._get_node(basis_node)
            if (type(self_node) == InternalNode
                and type(basis_node) == InternalNode):
                # Matching internal nodes
                process_common_internal_nodes(self_node, basis_node)
            elif (type(self_node) == LeafNode
                  and type(basis_node) == LeafNode):
                process_common_leaf_nodes(self_node, basis_node)
            else:
                process_node(self_node, self_path, self, self_pending)
                process_node(basis_node, basis_path, basis, basis_pending)
        process_common_prefix_nodes(self_node, None, basis_node, None)
        self_seen = set()
        basis_seen = set()
        excluded_keys = set()
        def check_excluded(key_path):
            # Note that this is N^2, it depends on us trimming trees
            # aggressively to not become slow.
            # A better implementation would probably have a reverse map
            # back to the children of a node, and jump straight to it when
            # a common node is detected, then proceed to remove the already
            # pending children. bzrlib.graph has a searcher module with a
            # similar problem.
            while key_path is not None:
                key, key_path = key_path
                if key in excluded_keys:
                    return True
            return False

        loop_counter = 0
        while self_pending or basis_pending:
            loop_counter += 1
            if not self_pending:
                # self is exhausted: output remainder of basis
                for prefix, key, node, path in basis_pending:
                    if check_excluded(path):
                        continue
                    node = basis._get_node(node)
                    if key is not None:
                        # a value
                        yield (key, node, None)
                    else:
                        # subtree - fastpath the entire thing.
                        for key, value in node.iteritems(basis._store):
                            yield (key, value, None)
                return
            elif not basis_pending:
                # basis is exhausted: output remainder of self.
                for prefix, key, node, path in self_pending:
                    if check_excluded(path):
                        continue
                    node = self._get_node(node)
                    if key is not None:
                        # a value
                        yield (key, None, node)
                    else:
                        # subtree - fastpath the entire thing.
                        for key, value in node.iteritems(self._store):
                            yield (key, None, value)
                return
            else:
                # XXX: future optimisation - yield the smaller items
                # immediately rather than pushing everything on/off the
                # heaps. Applies to both internal nodes and leafnodes.
                if self_pending[0][0] < basis_pending[0][0]:
                    # expand self
                    prefix, key, node, path = heapq.heappop(self_pending)
                    if check_excluded(path):
                        continue
                    if key is not None:
                        # a value
                        yield (key, None, node)
                    else:
                        process_node(node, path, self, self_pending)
                        continue
                elif self_pending[0][0] > basis_pending[0][0]:
                    # expand basis
                    prefix, key, node, path = heapq.heappop(basis_pending)
                    if check_excluded(path):
                        continue
                    if key is not None:
                        # a value
                        yield (key, node, None)
                    else:
                        process_node(node, path, basis, basis_pending)
                        continue
                else:
                    # common prefix: possibly expand both
                    if self_pending[0][1] is None:
                        # process next self
                        read_self = True
                    else:
                        read_self = False
                    if basis_pending[0][1] is None:
                        # process next basis
                        read_basis = True
                    else:
                        read_basis = False
                    if not read_self and not read_basis:
                        # compare a common value
                        self_details = heapq.heappop(self_pending)
                        basis_details = heapq.heappop(basis_pending)
                        if self_details[2] != basis_details[2]:
                            yield (self_details[1],
                                basis_details[2], self_details[2])
                        continue
                    # At least one side wasn't a simple value
                    if (self._node_key(self_pending[0][2]) ==
                        self._node_key(basis_pending[0][2])):
                        # Identical pointers, skip (and don't bother adding to
                        # excluded, it won't turn up again.)
                        heapq.heappop(self_pending)
                        heapq.heappop(basis_pending)
                        continue
                    # Now we need to expand this node before we can continue
                    if read_self and read_basis:
                        # Both sides start with the same prefix, so process
                        # them in parallel
                        self_prefix, _, self_node, self_path = heapq.heappop(
                            self_pending)
                        basis_prefix, _, basis_node, basis_path = heapq.heappop(
                            basis_pending)
                        if self_prefix != basis_prefix:
                            raise AssertionError(
                                '%r != %r' % (self_prefix, basis_prefix))
                        process_common_prefix_nodes(
                            self_node, self_path,
                            basis_node, basis_path)
                        continue
                    if read_self:
                        prefix, key, node, path = heapq.heappop(self_pending)
                        if check_excluded(path):
                            continue
                        process_node(node, path, self, self_pending)
                    if read_basis:
                        prefix, key, node, path = heapq.heappop(basis_pending)
                        if check_excluded(path):
                            continue
                        process_node(node, path, basis, basis_pending)
        # print loop_counter

    def iteritems(self, key_filter=None):
        """Iterate over the entire CHKMap's contents."""
        self._ensure_root()
        return self._root_node.iteritems(self._store, key_filter=key_filter)

    def key(self):
        """Return the key for this map."""
        if type(self._root_node) is tuple:
            return self._root_node
        else:
            return self._root_node._key

    def __len__(self):
        self._ensure_root()
        return len(self._root_node)

    def map(self, key, value):
        """Map a key tuple to value."""
        # Need a root object.
        self._ensure_root()
        prefix, node_details = self._root_node.map(self._store, key, value)
        if len(node_details) == 1:
            self._root_node = node_details[0][1]
        else:
            self._root_node = InternalNode(prefix,
                                search_key_func=self._search_key_func)
            self._root_node.set_maximum_size(node_details[0][1].maximum_size)
            self._root_node._key_width = node_details[0][1]._key_width
            for split, node in node_details:
                self._root_node.add_node(split, node)

    def _node_key(self, node):
        """Get the key for a node whether it's a tuple or node."""
        if type(node) is tuple:
            return node
        else:
            return node._key

    def unmap(self, key, check_remap=True):
        """Remove key from the map."""
        self._ensure_root()
        if type(self._root_node) is InternalNode:
            unmapped = self._root_node.unmap(self._store, key,
                check_remap=check_remap)
        else:
            unmapped = self._root_node.unmap(self._store, key)
        self._root_node = unmapped

    def _check_remap(self):
        """Check if nodes can be collapsed."""
        self._ensure_root()
        if type(self._root_node) is InternalNode:
            self._root_node._check_remap(self._store)

    def _save(self):
        """Save the map completely.

        :return: The key of the root node.
        """
        if type(self._root_node) is tuple:
            # Already saved.
            return self._root_node
        keys = list(self._root_node.serialise(self._store))
        return keys[-1]


class Node(object):
    """Base class defining the protocol for CHK Map nodes.

    :ivar _raw_size: The total size of the serialized key:value data, before
        adding the header bytes, and without prefix compression.
    """

    def __init__(self, key_width=1):
        """Create a node.

        :param key_width: The width of keys for this node.
        """
        self._key = None
        # Current number of elements
        self._len = 0
        self._maximum_size = 0
        self._key_width = key_width
        # current size in bytes
        self._raw_size = 0
        # The pointers/values this node has - meaning defined by child classes.
        self._items = {}
        # The common search prefix
        self._search_prefix = None

    def __repr__(self):
        items_str = str(sorted(self._items))
        if len(items_str) > 20:
            items_str = items_str[:16] + '...]'
        return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % (
            self.__class__.__name__, self._key, self._len, self._raw_size,
            self._maximum_size, self._search_prefix, items_str)

    def key(self):
        return self._key

    def __len__(self):
        return self._len

    @property
    def maximum_size(self):
        """What is the upper limit for adding references to a node."""
        return self._maximum_size

    def set_maximum_size(self, new_size):
        """Set the size threshold for nodes.

        :param new_size: The size at which no data is added to a node. 0 for
            unlimited.
        """
        self._maximum_size = new_size

    @classmethod
    def common_prefix(cls, prefix, key):
        """Given 2 strings, return the longest prefix common to both.

        :param prefix: This has been the common prefix for other keys, so it is
            more likely to be the common prefix in this case as well.
        :param key: Another string to compare to.
        """
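        # For example, common_prefix('abcde', 'abcfg') == 'abc'; the pos = -1
        # default below also makes common_prefix('abc', '') == ''.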
        if key.startswith(prefix):
            return prefix
        pos = -1
        # Is there a better way to do this?
        for pos, (left, right) in enumerate(zip(prefix, key)):
            if left != right:
                pos -= 1
                break
        common = prefix[:pos+1]
        return common

    @classmethod
    def common_prefix_for_keys(cls, keys):
        """Given a list of keys, find their common prefix.

        :param keys: An iterable of strings.
        :return: The longest common prefix of all keys.
        """
        common_prefix = None
        for key in keys:
            if common_prefix is None:
                common_prefix = key
                continue
            common_prefix = cls.common_prefix(common_prefix, key)
            if not common_prefix:
                # if common_prefix is the empty string, then we know it won't
                # change further
                return ''
        return common_prefix


# Singleton indicating we have not computed _search_prefix yet
_unknown = object()

class LeafNode(Node):
    """A node containing actual key:value pairs.

    :ivar _items: A dict of key->value items. The key is in tuple form.
    :ivar _size: The number of bytes that would be used by serializing all of
        the key/value pairs.
    """

    def __init__(self, search_key_func=None):
        Node.__init__(self)
        # All of the keys in this leaf node share this common prefix
        self._common_serialised_prefix = None
        self._serialise_key = '\x00'.join
        if search_key_func is None:
            self._search_key_func = _search_key_plain
        else:
            self._search_key_func = search_key_func

    def __repr__(self):
        items_str = str(sorted(self._items))
        if len(items_str) > 20:
            items_str = items_str[:16] + '...]'
        return \
            '%s(key:%s len:%s size:%s max:%s prefix:%s keywidth:%s items:%s)' \
            % (self.__class__.__name__, self._key, self._len, self._raw_size,
            self._maximum_size, self._search_prefix, self._key_width, items_str)

    def _current_size(self):
        """Answer the current serialised size of this node.

        This differs from self._raw_size in that it includes the bytes used for
        the header.
        """
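        # For example, an empty node with maximum_size=10 and key_width=1
        # serialises to 'chkleaf:\n10\n1\n0\n\n': 9 + 3 + 2 + 2 + 1 = 17 bytes.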
        if self._common_serialised_prefix is None:
            bytes_for_items = 0
            prefix_len = 0
        else:
            # We will store a single string with the common prefix
            # And then that common prefix will not be stored in any of the
            # entry lines
            prefix_len = len(self._common_serialised_prefix)
            bytes_for_items = (self._raw_size - (prefix_len * self._len))
        return (9 # 'chkleaf:\n'
            + len(str(self._maximum_size)) + 1
            + len(str(self._key_width)) + 1
            + len(str(self._len)) + 1
            + prefix_len + 1
            + bytes_for_items)

    @classmethod
    def deserialise(klass, bytes, key, search_key_func=None):
        """Deserialise bytes, with key key, into a LeafNode.

        :param bytes: The bytes of the node.
        :param key: The key that the serialised node has.
        """
        return _deserialise_leaf_node(bytes, key,
                                      search_key_func=search_key_func)

    def iteritems(self, store, key_filter=None):
        """Iterate over items in the node.

        :param key_filter: A filter to apply to the node. It should be a
            list/set/dict or similar repeatedly iterable container.
        """
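        # For example, with _key_width 2 the filter key ('a', 'b') is an exact
        # lookup, while a shorter ('a',) matches every stored key whose first
        # element is 'a'.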
        if key_filter is not None:
            # Adjust the filter - short elements go to a prefix filter. All
            # other items are looked up directly.
            # XXX: perhaps defaultdict? Profiling<rinse and repeat>
            filters = {}
            for key in key_filter:
                if len(key) == self._key_width:
                    # This filter is meant to match exactly one key, yield it
                    # if we have it.
                    try:
                        yield key, self._items[key]
                    except KeyError:
                        # This key is not present in this map, continue
                        pass
                else:
                    # Short items, we need to match based on a prefix
                    length_filter = filters.setdefault(len(key), set())
                    length_filter.add(key)
            if filters:
                filters = filters.items()
                for item in self._items.iteritems():
                    for length, length_filter in filters:
                        if item[0][:length] in length_filter:
                            yield item
                            break
        else:
            for item in self._items.iteritems():
                yield item

    def _key_value_len(self, key, value):
        # TODO: Should probably be done without actually joining the key, but
        #       then that can be done via the C extension
        return (len(self._serialise_key(key)) + 1
                + len(str(value.count('\n'))) + 1
                + len(value) + 1)
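        # This approximates the length of one serialised entry (the
        # 'key\x00num_lines\n' header plus the value and its trailing newline,
        # cf. serialise below) without building the actual string; the digit
        # count for num_lines is estimated from value.count('\n').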

    def _search_key(self, key):
        return self._search_key_func(key)

    def _map_no_split(self, key, value):
        """Map a key to a value.

        This assumes either the key does not already exist, or you have already
        removed its size and length from self.

        :return: True if adding this node should cause us to split.
        """
        self._items[key] = value
        self._raw_size += self._key_value_len(key, value)
        self._len += 1
        serialised_key = self._serialise_key(key)
        if self._common_serialised_prefix is None:
            self._common_serialised_prefix = serialised_key
        else:
            self._common_serialised_prefix = self.common_prefix(
                self._common_serialised_prefix, serialised_key)
        search_key = self._search_key(key)
        if self._search_prefix is _unknown:
            self._compute_search_prefix()
        if self._search_prefix is None:
            self._search_prefix = search_key
        else:
            self._search_prefix = self.common_prefix(
                self._search_prefix, search_key)
        if (self._len > 1
            and self._maximum_size
            and self._current_size() > self._maximum_size):
            # Check to see if all of the search_keys for this node are
            # identical. We allow the node to grow under that circumstance
            # (we could track this as common state, but it is infrequent)
            if (search_key != self._search_prefix
                or not self._are_search_keys_identical()):
                return True
        return False

    def _split(self, store):
        """We have overflowed.

        Split this node into multiple LeafNodes, return it up the stack so that
        the next layer creates a new InternalNode and references the new nodes.

        :return: (common_serialised_prefix, [(node_serialised_prefix, node)])
        """
        if self._search_prefix is _unknown:
            raise AssertionError('Search prefix must be known')
        common_prefix = self._search_prefix
        split_at = len(common_prefix) + 1
        result = {}
        for key, value in self._items.iteritems():
            search_key = self._search_key(key)
            prefix = search_key[:split_at]
            # TODO: Generally only 1 key can be exactly the right length,
            #       which means we can only have 1 key in the node pointed
            #       at by the 'prefix\0' key. We might want to consider
            #       folding it into the containing InternalNode rather than
            #       having a fixed length-1 node.
            #       Note this is probably not true for hash keys, as they
            #       may get a '\00' node anywhere, but won't have keys of
            #       different lengths.
            if len(prefix) < split_at:
                prefix += '\x00'*(split_at - len(prefix))
            if prefix not in result:
                node = LeafNode(search_key_func=self._search_key_func)
                node.set_maximum_size(self._maximum_size)
                node._key_width = self._key_width
                result[prefix] = node
            else:
                node = result[prefix]
            sub_prefix, node_details = node.map(store, key, value)
            if len(node_details) > 1:
                if prefix != sub_prefix:
                    # This node has been split and is now found via a different
                    # path
                    result.pop(prefix)
                new_node = InternalNode(sub_prefix,
                    search_key_func=self._search_key_func)
                new_node.set_maximum_size(self._maximum_size)
                new_node._key_width = self._key_width
                for split, node in node_details:
                    new_node.add_node(split, node)
                result[prefix] = new_node
        return common_prefix, result.items()

    def map(self, store, key, value):
        """Map key to value."""
        if key in self._items:
            self._raw_size -= self._key_value_len(key, self._items[key])
            self._len -= 1
        self._key = None
        if self._map_no_split(key, value):
            return self._split(store)
        else:
            if self._search_prefix is _unknown:
                raise AssertionError('%r must be known' % self._search_prefix)
            return self._search_prefix, [("", self)]

    def serialise(self, store):
        """Serialise the LeafNode to store.

        :param store: A VersionedFiles honouring the CHK extensions.
        :return: An iterable of the keys inserted by this operation.
        """
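        # The layout written below is: 'chkleaf:\n', maximum_size, key_width,
        # length, the common prefix, then one 'key\x00num_lines\n' entry per
        # item (with the common prefix stripped) followed by its value lines.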
        lines = ["chkleaf:\n"]
        lines.append("%d\n" % self._maximum_size)
        lines.append("%d\n" % self._key_width)
        lines.append("%d\n" % self._len)
        if self._common_serialised_prefix is None:
            lines.append('\n')
            if len(self._items) != 0:
                raise AssertionError('If _common_serialised_prefix is None'
                    ' we should have no items')
        else:
            lines.append('%s\n' % (self._common_serialised_prefix,))
            prefix_len = len(self._common_serialised_prefix)
        for key, value in sorted(self._items.items()):
            # Always add a final newline
            value_lines = osutils.chunks_to_lines([value + '\n'])
            serialized = "%s\x00%s\n" % (self._serialise_key(key),
                                         len(value_lines))
            if not serialized.startswith(self._common_serialised_prefix):
                raise AssertionError('We thought the common prefix was %r'
                    ' but entry %r does not have it in common'
                    % (self._common_serialised_prefix, serialized))
            lines.append(serialized[prefix_len:])
            lines.extend(value_lines)
        sha1, _, _ = store.add_lines((None,), (), lines)
        self._key = ("sha1:" + sha1,)
        bytes = ''.join(lines)
        if len(bytes) != self._current_size():
            raise AssertionError('Invalid _current_size')
        _page_cache.add(self._key, bytes)
        return [self._key]

    def refs(self):
        """Return the references to other CHKs held by this node."""
        return []

    def _compute_search_prefix(self):
        """Determine the common search prefix for all keys in this node.

        :return: A bytestring of the longest search key prefix that is
            unique within this node.
        """
        search_keys = [self._search_key_func(key) for key in self._items]
        self._search_prefix = self.common_prefix_for_keys(search_keys)
        return self._search_prefix

    def _are_search_keys_identical(self):
        """Check to see if the search keys for all entries are the same.

        When using a hash as the search_key it is possible for non-identical
        keys to collide. If that happens enough, we may try to overflow a
        LeafNode, but as all are collisions, we must not split.
        """
        common_search_key = None
        for key in self._items:
            search_key = self._search_key(key)
            if common_search_key is None:
                common_search_key = search_key
            elif search_key != common_search_key:
                return False
        return True

    def _compute_serialised_prefix(self):
        """Determine the common prefix for serialised keys in this node.

        :return: A bytestring of the longest serialised key prefix that is
            unique within this node.
        """
        serialised_keys = [self._serialise_key(key) for key in self._items]
        self._common_serialised_prefix = self.common_prefix_for_keys(
            serialised_keys)
        return self._common_serialised_prefix

    def unmap(self, store, key):
        """Unmap key from the node."""
        try:
            self._raw_size -= self._key_value_len(key, self._items[key])
        except KeyError:
            trace.mutter("key %s not found in %r", key, self._items)
            raise
        self._len -= 1
        del self._items[key]
        self._key = None
        # Recompute from scratch
        self._compute_search_prefix()
        self._compute_serialised_prefix()
        return self


class InternalNode(Node):  | 
|
925  | 
"""A node that contains references to other nodes.  | 
|
926  | 
||
927  | 
    An InternalNode is responsible for mapping search key prefixes to child
 | 
|
928  | 
    nodes.
 | 
|
929  | 
||
930  | 
    :ivar _items: serialised_key => node dictionary. node may be a tuple,
 | 
|
931  | 
        LeafNode or InternalNode.
 | 
|
932  | 
    """
 | 
|
933  | 
||
934  | 
def __init__(self, prefix='', search_key_func=None):  | 
|
935  | 
Node.__init__(self)  | 
|
936  | 
        # The size of an internalnode with default values and no children.
 | 
|
937  | 
        # How many octets key prefixes within this node are.
 | 
|
938  | 
self._node_width = 0  | 
|
939  | 
self._search_prefix = prefix  | 
|
940  | 
if search_key_func is None:  | 
|
941  | 
self._search_key_func = _search_key_plain  | 
|
942  | 
else:  | 
|
943  | 
self._search_key_func = search_key_func  | 
|
944  | 
||
945  | 
def add_node(self, prefix, node):  | 
|
946  | 
"""Add a child node with prefix prefix, and node node.  | 
|
947  | 
||
948  | 
        :param prefix: The search key prefix for node.
 | 
|
949  | 
        :param node: The node being added.
 | 
|
950  | 
        """
 | 
|
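        # For example, a root with _search_prefix '' takes children keyed by
        # one-byte prefixes ('a', 'b', ...); a child with prefix 'a' takes
        # 'aa', 'ab', ..., so each level widens the prefix by exactly one
        # octet, as the checks below enforce.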
        if self._search_prefix is None:
            raise AssertionError("_search_prefix should not be None")
        if not prefix.startswith(self._search_prefix):
            raise AssertionError("prefixes mismatch: %s must start with %s"
                % (prefix, self._search_prefix))
        if len(prefix) != len(self._search_prefix) + 1:
            raise AssertionError("prefix wrong length: len(%s) is not %d" %
                (prefix, len(self._search_prefix) + 1))
        self._len += len(node)
        if not len(self._items):
            self._node_width = len(prefix)
        if self._node_width != len(self._search_prefix) + 1:
            raise AssertionError("node width mismatch: %d is not %d" %
                (self._node_width, len(self._search_prefix) + 1))
        self._items[prefix] = node
        self._key = None

968  | 
def _current_size(self):  | 
|
969  | 
"""Answer the current serialised size of this node."""  | 
|
970  | 
return (self._raw_size + len(str(self._len)) + len(str(self._key_width)) +  | 
|
971  | 
len(str(self._maximum_size)))  | 
|
972  | 
||
973  | 
    @classmethod
 | 
|
974  | 
def deserialise(klass, bytes, key, search_key_func=None):  | 
|
975  | 
"""Deserialise bytes to an InternalNode, with key key.  | 
|
976  | 
||
977  | 
        :param bytes: The bytes of the node.
 | 
|
978  | 
        :param key: The key that the serialised node has.
 | 
|
979  | 
        :return: An InternalNode instance.
 | 
|
980  | 
        """
 | 
|
981  | 
return _deserialise_internal_node(bytes, key,  | 
|
982  | 
search_key_func=search_key_func)  | 
|
983  | 
||
984  | 
def iteritems(self, store, key_filter=None):  | 
|
985  | 
for node, node_filter in self._iter_nodes(store, key_filter=key_filter):  | 
|
986  | 
for item in node.iteritems(store, key_filter=node_filter):  | 
|
987  | 
yield item  | 
|
988  | 
||
989  | 
def _iter_nodes(self, store, key_filter=None, batch_size=None):  | 
|
990  | 
"""Iterate over node objects which match key_filter.  | 
|
991  | 
||
992  | 
        :param store: A store to use for accessing content.
 | 
|
993  | 
        :param key_filter: A key filter to filter nodes. Only nodes that might
 | 
|
994  | 
            contain a key in key_filter will be returned.
 | 
|
995  | 
        :param batch_size: If not None, then we will return the nodes that had
 | 
|
996  | 
            to be read using get_record_stream in batches, rather than reading
 | 
|
997  | 
            them all at once.
 | 
|
998  | 
        :return: An iterable of nodes. This function does not have to be fully
 | 
|
999  | 
            consumed.  (There will be no pending I/O when items are being returned.)
 | 
|
1000  | 
        """
 | 
|
1001  | 
        # Map from chk key ('sha1:...',) to (prefix, key_filter)
 | 
|
1002  | 
        # prefix is the key in self._items to use, key_filter is the key_filter
 | 
|
1003  | 
        # entries that would match this node
 | 
|
1004  | 
keys = {}  | 
|
| 
4413.4.1
by John Arbash Meinel
 Add a shortcut for the case when we are searching for a single full-width key.  | 
1005  | 
shortcut = False  | 
| 
4241.6.1
by Ian Clatworthy
 chk_map code from brisbane-core  | 
1006  | 
if key_filter is None:  | 
| 
4413.4.1
by John Arbash Meinel
 Add a shortcut for the case when we are searching for a single full-width key.  | 
1007  | 
            # yielding all nodes, yield whatever we have, and queue up a read
 | 
1008  | 
            # for whatever we are missing
 | 
|
1009  | 
shortcut = True  | 
|
| 
4241.6.1
by Ian Clatworthy
 chk_map code from brisbane-core  | 
1010  | 
for prefix, node in self._items.iteritems():  | 
| 
4413.4.4
by John Arbash Meinel
 Fix some type() == tuple to be 'type() is tuple' or '.__class__ is tuple'  | 
1011  | 
if node.__class__ is tuple:  | 
| 
4241.6.1
by Ian Clatworthy
 chk_map code from brisbane-core  | 
1012  | 
keys[node] = (prefix, None)  | 
1013  | 
else:  | 
|
1014  | 
yield node, None  | 
|
| 
4413.4.1
by John Arbash Meinel
 Add a shortcut for the case when we are searching for a single full-width key.  | 
1015  | 
elif len(key_filter) == 1:  | 
| 
4413.4.2
by John Arbash Meinel
 Rewrite the shortcuts.  | 
1016  | 
            # Technically, this path could also be handled by the first check
 | 
1017  | 
            # in 'self._node_width' in length_filters. However, we can handle
 | 
|
1018  | 
            # this case without spending any time building up the
 | 
|
1019  | 
            # prefix_to_keys, etc state.
 | 
|
1020  | 
||
1021  | 
            # This is a bit ugly, but TIMEIT showed it to be by far the fastest
 | 
|
1022  | 
            # 0.626us   list(key_filter)[0]
 | 
|
1023  | 
            #       is a func() for list(), 2 mallocs, and a getitem
 | 
|
1024  | 
            # 0.489us   [k for k in key_filter][0]
 | 
|
1025  | 
            #       still has the mallocs, avoids the func() call
 | 
|
1026  | 
            # 0.350us   iter(key_filter).next()
 | 
|
1027  | 
            #       has a func() call, and mallocs an iterator
 | 
|
1028  | 
            # 0.125us   for key in key_filter: pass
 | 
|
1029  | 
            #       no func() overhead, might malloc an iterator
 | 
|
1030  | 
            # 0.105us   for key in key_filter: break
 | 
|
1031  | 
            #       no func() overhead, might malloc an iterator, probably
 | 
|
1032  | 
            #       avoids checking an 'else' clause as part of the for
 | 
|
1033  | 
for key in key_filter:  | 
|
1034  | 
                break
            search_prefix = self._search_prefix_filter(key)
            if len(search_prefix) == self._node_width:
                # This item will match exactly, so just do a dict lookup, and
                # see what we can return
                shortcut = True
                try:
                    node = self._items[search_prefix]
                except KeyError:
                    # A given key can only match 1 child node, if it isn't
                    # there, then we can just return nothing
                    return
                if node.__class__ is tuple:
                    keys[node] = (search_prefix, [key])
                else:
                    # This is loaded, and the only thing that can match,
                    # return
                    yield node, [key]
                    return
        if not shortcut:
            # First, convert all keys into a list of search prefixes
            # Aggregate common prefixes, and track the keys they come from
            prefix_to_keys = {}
            length_filters = {}
            for key in key_filter:
                search_prefix = self._search_prefix_filter(key)
                length_filter = length_filters.setdefault(
                                    len(search_prefix), set())
                length_filter.add(search_prefix)
                prefix_to_keys.setdefault(search_prefix, []).append(key)

            if (self._node_width in length_filters
                and len(length_filters) == 1):
                # all of the search prefixes match exactly _node_width. This
                # means that everything is an exact match, and we can do a
                # lookup into self._items, rather than iterating over the items
                # dict.
                search_prefixes = length_filters[self._node_width]
                for search_prefix in search_prefixes:
                    try:
                        node = self._items[search_prefix]
                    except KeyError:
                        # We can ignore this one
                        continue
                    node_key_filter = prefix_to_keys[search_prefix]
                    if node.__class__ is tuple:
                        keys[node] = (search_prefix, node_key_filter)
                    else:
                        yield node, node_key_filter
            else:
                # The slow way. We walk every item in self._items, and check to
                # see if there are any matches
                length_filters = length_filters.items()
                for prefix, node in self._items.iteritems():
                    node_key_filter = []
                    for length, length_filter in length_filters:
                        sub_prefix = prefix[:length]
                        if sub_prefix in length_filter:
                            node_key_filter.extend(prefix_to_keys[sub_prefix])
                    if node_key_filter: # this key matched something, yield it
                        if node.__class__ is tuple:
                            keys[node] = (prefix, node_key_filter)
                        else:
                            yield node, node_key_filter
        if keys:
            # Look in the page cache for some more bytes
            found_keys = set()
            for key in keys:
                try:
                    bytes = _page_cache[key]
                except KeyError:
                    continue
                else:
                    node = _deserialise(bytes, key,
                        search_key_func=self._search_key_func)
                    prefix, node_key_filter = keys[key]
                    self._items[prefix] = node
                    found_keys.add(key)
                    yield node, node_key_filter
            for key in found_keys:
                del keys[key]
        if keys:
            # demand load some pages.
            if batch_size is None:
                # Read all the keys in
                batch_size = len(keys)
            key_order = list(keys)
            for batch_start in range(0, len(key_order), batch_size):
                batch = key_order[batch_start:batch_start + batch_size]
                # We have to fully consume the stream so there is no pending
                # I/O, so we buffer the nodes for now.
                stream = store.get_record_stream(batch, 'unordered', True)
                node_and_filters = []
                for record in stream:
                    bytes = record.get_bytes_as('fulltext')
                    node = _deserialise(bytes, record.key,
                        search_key_func=self._search_key_func)
                    prefix, node_key_filter = keys[record.key]
                    node_and_filters.append((node, node_key_filter))
                    self._items[prefix] = node
                    _page_cache.add(record.key, bytes)
                for info in node_and_filters:
                    yield info

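    # Illustrative usage sketch (not part of the original module): a caller
    # streaming the children of an internal node in bounded batches might
    # write, given a 'store' honouring the CHK VersionedFiles extensions:
    #
    #     for node, key_filter in internal_node._iter_nodes(store,
    #             batch_size=16):
    #         ...  # unresolved children are fetched at most 16 per request
    #
    # Children already deserialised (or present in _page_cache) are yielded
    # immediately; only the remainder costs get_record_stream() round trips.
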
    def map(self, store, key, value):
        """Map key to value."""
        if not len(self._items):
            raise AssertionError("can't map in an empty InternalNode.")
        search_key = self._search_key(key)
        if self._node_width != len(self._search_prefix) + 1:
            raise AssertionError("node width mismatch: %d is not %d" %
                (self._node_width, len(self._search_prefix) + 1))
        if not search_key.startswith(self._search_prefix):
            # This key doesn't fit in this index, so we need to split at the
            # point where it would fit, insert self into that internal node,
            # and then map this key into that node.
            new_prefix = self.common_prefix(self._search_prefix,
                                            search_key)
            new_parent = InternalNode(new_prefix,
                search_key_func=self._search_key_func)
            new_parent.set_maximum_size(self._maximum_size)
            new_parent._key_width = self._key_width
            new_parent.add_node(self._search_prefix[:len(new_prefix)+1],
                                self)
            return new_parent.map(store, key, value)
        children = [node for node, _
                          in self._iter_nodes(store, key_filter=[key])]
        if children:
            child = children[0]
        else:
            # new child needed:
            child = self._new_child(search_key, LeafNode)
        old_len = len(child)
        if type(child) is LeafNode:
            old_size = child._current_size()
        else:
            old_size = None
        prefix, node_details = child.map(store, key, value)
        if len(node_details) == 1:
            # child may have shrunk, or might be a new node
            child = node_details[0][1]
            self._len = self._len - old_len + len(child)
            self._items[search_key] = child
            self._key = None
            new_node = self
            if type(child) is LeafNode:
                if old_size is None:
                    # The old node was an InternalNode which means it has now
                    # collapsed, so we need to check if it will chain to a
                    # collapse at this level.
                    trace.mutter("checking remap as InternalNode -> LeafNode")
                    new_node = self._check_remap(store)
                else:
                    # If the LeafNode has shrunk in size, we may want to run
                    # a remap check. Checking for a remap is expensive though
                    # and the frequency of a successful remap is very low.
                    # Shrinkage by small amounts is common, so we only do the
                    # remap check if the new_size is low or the shrinkage
                    # amount is over a configurable limit.
                    new_size = child._current_size()
                    shrinkage = old_size - new_size
                    if (shrinkage > 0 and new_size < _INTERESTING_NEW_SIZE
                        or shrinkage > _INTERESTING_SHRINKAGE_LIMIT):
                        trace.mutter(
                            "checking remap as size shrunk by %d to be %d",
                            shrinkage, new_size)
                        new_node = self._check_remap(store)
            if new_node._search_prefix is None:
                raise AssertionError("_search_prefix should not be None")
            return new_node._search_prefix, [('', new_node)]
        # child has overflowed - create a new intermediate node.
        # XXX: This is where we might want to try and expand our depth
        # to refer to more bytes of every child (which would give us
        # multiple pointers to child nodes, but less intermediate nodes)
        child = self._new_child(search_key, InternalNode)
        child._search_prefix = prefix
        for split, node in node_details:
            child.add_node(split, node)
        self._len = self._len - old_len + len(child)
        self._key = None
        return self._search_prefix, [("", self)]

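    # Illustrative note (hypothetical values, not from the original source):
    # map() returns a (search_prefix, node_details) pair. A single entry like
    #
    #     ('E', [('', node)])
    #
    # means the subtree was updated (possibly collapsed) in place, while
    # multiple (split, node) entries tell the parent to re-add each piece
    # under its split prefix, as done in the overflow branch above.
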
    def _new_child(self, search_key, klass):
        """Create a new child node of type klass."""
        child = klass()
        child.set_maximum_size(self._maximum_size)
        child._key_width = self._key_width
        child._search_key_func = self._search_key_func
        self._items[search_key] = child
        return child

    def serialise(self, store):
        """Serialise the node to store.

        :param store: A VersionedFiles honouring the CHK extensions.
        :return: An iterable of the keys inserted by this operation.
        """
        for node in self._items.itervalues():
            if type(node) is tuple:
                # Never deserialised.
                continue
            if node._key is not None:
                # Never altered
                continue
            for key in node.serialise(store):
                yield key
        lines = ["chknode:\n"]
        lines.append("%d\n" % self._maximum_size)
        lines.append("%d\n" % self._key_width)
        lines.append("%d\n" % self._len)
        if self._search_prefix is None:
            raise AssertionError("_search_prefix should not be None")
        lines.append('%s\n' % (self._search_prefix,))
        prefix_len = len(self._search_prefix)
        for prefix, node in sorted(self._items.items()):
            if type(node) is tuple:
                key = node[0]
            else:
                key = node._key[0]
            serialised = "%s\x00%s\n" % (prefix, key)
            if not serialised.startswith(self._search_prefix):
                raise AssertionError("prefixes mismatch: %s must start with %s"
                    % (serialised, self._search_prefix))
            lines.append(serialised[prefix_len:])
        sha1, _, _ = store.add_lines((None,), (), lines)
        self._key = ("sha1:" + sha1,)
        _page_cache.add(self._key, ''.join(lines))
        yield self._key

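    # Illustrative layout sketch (hypothetical values, not from the original
    # source): an InternalNode with maximum_size=4096, key_width=1, twelve
    # keys under two children, and search prefix 'E' would serialise roughly
    # as:
    #
    #     chknode:
    #     4096
    #     1
    #     12
    #     E
    #     3\x00sha1:aaa...
    #     F\x00sha1:bbb...
    #
    # Each child line is '%s\x00%s\n' % (prefix, key) with the common search
    # prefix ('E' here) sliced off the front, so child prefixes are stored
    # relative to the node's own prefix.
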
    def _search_key(self, key):
        """Return the serialised key for key in this node."""
        # search keys are fixed width. All will be self._node_width wide, so
        # we pad as necessary.
        return (self._search_key_func(key) + '\x00'*self._node_width)[:self._node_width]

    def _search_prefix_filter(self, key):
        """Serialise key for use as a prefix filter in iteritems."""
        return self._search_key_func(key)[:self._node_width]

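    # Illustrative note (hypothetical values, not from the original source):
    # with _node_width = 2 and a search key function returning 'E', the two
    # helpers differ only in padding:
    #
    #     _search_key(key)           -> 'E\x00'  (padded to exactly 2 bytes)
    #     _search_prefix_filter(key) -> 'E'      (truncated, never padded)
    #
    # so short keys can still act as prefix filters, while child lookups use
    # fixed-width keys.
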
    def _split(self, offset):
        """Split this node into smaller nodes starting at offset.

        :param offset: The offset to start the new child nodes at.
        :return: An iterable of (prefix, node) tuples. prefix is a byte
            prefix for reaching node.
        """
        if offset >= self._node_width:
            for node in self._items.values():
                for result in node._split(offset):
                    yield result
            return
        for key, node in self._items.items():
            pass

    def refs(self):
        """Return the references to other CHKs held by this node."""
        if self._key is None:
            raise AssertionError("unserialised nodes have no refs.")
        refs = []
        for value in self._items.itervalues():
            if type(value) is tuple:
                refs.append(value)
            else:
                refs.append(value.key())
        return refs

    def _compute_search_prefix(self, extra_key=None):
        """Return the unique key prefix for this node.

        :return: A bytestring of the longest search key prefix that is
            unique within this node.
        """
        self._search_prefix = self.common_prefix_for_keys(self._items)
        return self._search_prefix

    def unmap(self, store, key, check_remap=True):
        """Remove key from this node and its children."""
        if not len(self._items):
            raise AssertionError("can't unmap in an empty InternalNode.")
        children = [node for node, _
                          in self._iter_nodes(store, key_filter=[key])]
        if children:
            child = children[0]
        else:
            raise KeyError(key)
        self._len -= 1
        unmapped = child.unmap(store, key)
        self._key = None
        search_key = self._search_key(key)
        if len(unmapped) == 0:
            # All child nodes are gone, remove the child:
            del self._items[search_key]
            unmapped = None
        else:
            # Stash the returned node
            self._items[search_key] = unmapped
        if len(self._items) == 1:
            # this node is no longer needed:
            return self._items.values()[0]
        if type(unmapped) is InternalNode:
            return self
        if check_remap:
            return self._check_remap(store)
        else:
            return self

    def _check_remap(self, store):
        """Check if all keys contained by children fit in a single LeafNode.

        :param store: A store to use for reading more nodes
        :return: Either self, or a new LeafNode which should replace self.
        """
        # Logic for how we determine when we need to rebuild
        # 1) Implicitly unmap() is removing a key which means that the child
        #    nodes are going to be shrinking by some extent.
        # 2) If all children are LeafNodes, it is possible that they could be
        #    combined into a single LeafNode, which can then completely replace
        #    this internal node with a single LeafNode
        # 3) If *one* child is an InternalNode, we assume it has already done
        #    all the work to determine that its children cannot collapse, and
        #    we can then assume that those nodes *plus* the current nodes don't
        #    have a chance of collapsing either.
        #    So a very cheap check is to just say if 'unmapped' is an
        #    InternalNode, we don't have to check further.

        # TODO: Another alternative is to check the total size of all known
        #       LeafNodes. If there is some formula we can use to determine the
        #       final size without actually having to read in any more
        #       children, it would be nice to have. However, we have to be
        #       careful with stuff like nodes that pull out the common prefix
        #       of each key, as adding a new key can change the common prefix
        #       and cause size changes greater than the length of one key.
        #       So for now, we just add everything to a new Leaf until it
        #       splits, as we know that will give the right answer.
        new_leaf = LeafNode(search_key_func=self._search_key_func)
        new_leaf.set_maximum_size(self._maximum_size)
        new_leaf._key_width = self._key_width
        # A batch_size of 16 was chosen because:
        #   a) In testing, a 4k page held 14 times. So if we have more than 16
        #      leaf nodes we are unlikely to hold them in a single new leaf
        #      node. This still allows for 1 round trip
        #   b) With 16-way fan out, we can still do a single round trip
        #   c) With 255-way fan out, we don't want to read all 255 and destroy
        #      the page cache, just to determine that we really don't need it.
        for node, _ in self._iter_nodes(store, batch_size=16):
            if type(node) is InternalNode:
                # Without looking at any leaf nodes, we are sure this cannot
                # collapse.
                return self
            for key, value in node._items.iteritems():
                if new_leaf._map_no_split(key, value):
                    return self
        trace.mutter("remap generated a new LeafNode")
        return new_leaf
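
    # Illustrative behaviour sketch (not from the original source): after
    # enough unmap() calls, a caller holding an InternalNode can let it
    # collapse itself:
    #
    #     node = internal_node._check_remap(store)
    #     if node is not internal_node:
    #         # every remaining item fits in one LeafNode; splice 'node'
    #         # in where the InternalNode used to hang.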


def _deserialise(bytes, key, search_key_func):
    """Helper for repository details - convert bytes to a node."""
    if bytes.startswith("chkleaf:\n"):
        node = LeafNode.deserialise(bytes, key, search_key_func=search_key_func)
    elif bytes.startswith("chknode:\n"):
        node = InternalNode.deserialise(bytes, key,
                                        search_key_func=search_key_func)
    else:
        raise AssertionError("Unknown node type.")
    return node

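# Illustrative dispatch sketch (hypothetical byte strings, not from the
# original source): the helper selects the node class purely from the header
# line of the serialised bytes, e.g.
#
#     _deserialise("chkleaf:\n...", ("sha1:...",), search_key_func=None)
#         -> LeafNode
#     _deserialise("chknode:\n...", ("sha1:...",), search_key_func=None)
#         -> InternalNode
#
# Anything else raises AssertionError, since only these two headers are ever
# written by serialise().

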
def _find_children_info(store, interesting_keys, uninteresting_keys, pb):
    """Read the associated records, and determine what is interesting."""
    uninteresting_keys = set(uninteresting_keys)
    chks_to_read = uninteresting_keys.union(interesting_keys)
    next_uninteresting = set()
    next_interesting = set()
    uninteresting_items = set()
    interesting_items = set()
    interesting_to_yield = []
    for record in store.get_record_stream(chks_to_read, 'unordered', True):
        # records_read.add(record.key())
        if pb is not None:
            pb.tick()
        bytes = record.get_bytes_as('fulltext')
        # We don't care about search_key_func for this code, because we only
        # care about external references.
        node = _deserialise(bytes, record.key, search_key_func=None)
        if record.key in uninteresting_keys:
            if type(node) is InternalNode:
                next_uninteresting.update(node.refs())
            else:
                # We know we are at a LeafNode, so we can pass None for the
                # store
                uninteresting_items.update(node.iteritems(None))
        else:
            interesting_to_yield.append(record.key)
            if type(node) is InternalNode:
                next_interesting.update(node.refs())
            else:
                interesting_items.update(node.iteritems(None))
    return (next_uninteresting, uninteresting_items,
            next_interesting, interesting_to_yield, interesting_items)

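# Illustrative note (not from the original source): the 5-tuple returned
# above fans out exactly one level of the walk:
#
#     (next_uninteresting, uninteresting_items,
#      next_interesting, interesting_to_yield, interesting_items)
#
# The two leading-name sets hold child CHK references still to be read, and
# the *_items sets hold (key, value) leaf entries seen so far.

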
def _find_all_uninteresting(store, interesting_root_keys,
                            uninteresting_root_keys, pb):
    """Determine the full set of uninteresting keys."""
    # What about duplicates between interesting_root_keys and
    # uninteresting_root_keys?
    if not uninteresting_root_keys:
        # Shortcut case. We know there is nothing uninteresting to filter
        # out, so we just let the rest of the algorithm do the work.
        # We know there is nothing uninteresting, and we didn't have to read
        # any interesting records yet.
        return (set(), set(), set(interesting_root_keys), [], set())
    all_uninteresting_chks = set(uninteresting_root_keys)
    all_uninteresting_items = set()

    # First step, find the direct children of both the interesting and
    # uninteresting set
    (uninteresting_keys, uninteresting_items,
     interesting_keys, interesting_to_yield,
     interesting_items) = _find_children_info(store, interesting_root_keys,
                                              uninteresting_root_keys,
                                              pb=pb)
    all_uninteresting_chks.update(uninteresting_keys)
    all_uninteresting_items.update(uninteresting_items)
    del uninteresting_items
    # Note: Exact matches between interesting and uninteresting do not need
    #       to be searched further. Non-exact matches need to be searched in
    #       case there is a future exact-match.
    uninteresting_keys.difference_update(interesting_keys)

    # Second, find the full set of uninteresting bits reachable by the
    # uninteresting roots
    chks_to_read = uninteresting_keys
    while chks_to_read:
        next_chks = set()
        for record in store.get_record_stream(chks_to_read, 'unordered', False):
            # TODO: Handle 'absent'
            if pb is not None:
                pb.tick()
            bytes = record.get_bytes_as('fulltext')
            # We don't care about search_key_func for this code, because we
            # only care about external references.
            node = _deserialise(bytes, record.key, search_key_func=None)
            if type(node) is InternalNode:
                # uninteresting_prefix_chks.update(node._items.iteritems())
                chks = node._items.values()
                # TODO: Should we remove the entries that are already in
                #       uninteresting_chks?
                next_chks.update(chks)
                all_uninteresting_chks.update(chks)
            else:
                all_uninteresting_items.update(node._items.iteritems())
        chks_to_read = next_chks
    return (all_uninteresting_chks, all_uninteresting_items,
            interesting_keys, interesting_to_yield, interesting_items)

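# Illustrative shape of the traversal above (not from the original source):
#
#     frontier = set(uninteresting_root_keys)
#     while frontier:
#         # one get_record_stream() round trip per level
#         frontier = child refs of the InternalNodes just read
#
# i.e. a breadth-first walk that stops once a level contributes no new pages.

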
def iter_interesting_nodes(store, interesting_root_keys,
                           uninteresting_root_keys, pb=None):
    """Given root keys, find interesting nodes.

    Evaluate nodes referenced by interesting_root_keys. Ones that are also
    referenced from uninteresting_root_keys are not considered interesting.

    :param interesting_root_keys: keys which should be part of the
        "interesting" nodes (which will be yielded)
    :param uninteresting_root_keys: keys which should be filtered out of the
        result set.
    :return: Yields (interesting record, {interesting key:values}) pairs.
    """
    # TODO: consider that it may be more memory efficient to use the 20-byte
    #       sha1 string, rather than tuples of hexadecimal sha1 strings.
    # TODO: Try to factor out a lot of the get_record_stream() calls into a
    #       helper function similar to _read_bytes. This function should be
    #       able to use nodes from the _page_cache as well as actually
    #       requesting bytes from the store.

    (all_uninteresting_chks, all_uninteresting_items, interesting_keys,
     interesting_to_yield, interesting_items) = _find_all_uninteresting(store,
        interesting_root_keys, uninteresting_root_keys, pb)

    # Now that we know everything uninteresting, we can yield information from
    # our first request
    interesting_items.difference_update(all_uninteresting_items)
    interesting_to_yield = set(interesting_to_yield) - all_uninteresting_chks
    if interesting_items:
        yield None, interesting_items
    if interesting_to_yield:
        # We request these records again, rather than buffering the root
        # records, most likely they are still in the _group_cache anyway.
        for record in store.get_record_stream(interesting_to_yield,
                                              'unordered', False):
            yield record, []
    all_uninteresting_chks.update(interesting_to_yield)
    interesting_keys.difference_update(all_uninteresting_chks)

    chks_to_read = interesting_keys
    counter = 0
    while chks_to_read:
        next_chks = set()
        for record in store.get_record_stream(chks_to_read, 'unordered', False):
            counter += 1
            if pb is not None:
                pb.update('find chk pages', counter)
            # TODO: Handle 'absent'?
            bytes = record.get_bytes_as('fulltext')
            # We don't care about search_key_func for this code, because we
            # only care about external references.
            node = _deserialise(bytes, record.key, search_key_func=None)
            if type(node) is InternalNode:
                # all_uninteresting_chks grows large, as it lists all nodes we
                # don't want to process (including already seen interesting
                # nodes).
                # small.difference_update(large) scales O(large), but
                # small.difference(large) scales O(small).
                # Also, we know we just _deserialised this node, so we can
                # access the dict directly.
                chks = set(node._items.itervalues()).difference(
                            all_uninteresting_chks)
                # Is set() and .difference_update better than:
                # chks = [chk for chk in node.refs()
                #              if chk not in all_uninteresting_chks]
                next_chks.update(chks)
                # These are now uninteresting everywhere else
                all_uninteresting_chks.update(chks)
                interesting_items = []
            else:
                interesting_items = [item for item in node._items.iteritems()
                                     if item not in all_uninteresting_items]
                # TODO: Do we need to filter out items that we have already
                #       seen on other pages? We don't really want to buffer the
                #       whole thing, but it does mean that callers need to
                #       understand they may get duplicate values.
                # all_uninteresting_items.update(interesting_items)
            yield record, interesting_items
        chks_to_read = next_chks

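# Illustrative usage sketch (not from the original source): streaming the CHK
# pages introduced by one root map relative to another, given a 'chk_bytes'
# store honouring the CHK VersionedFiles extensions:
#
#     for record, items in iter_interesting_nodes(
#             chk_bytes, [new_root_key], [old_root_key]):
#         if record is not None:
#             copy(record)        # a page not reachable from the old root
#         for key, value in items:
#             ...                 # leaf entries unique to the new root
#
# Note the first yield may pair None with a set of items, and duplicate
# values are possible across pages (see the TODO above).

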
try:
    from bzrlib._chk_map_pyx import (
        _search_key_16,
        _search_key_255,
        _deserialise_leaf_node,
        _deserialise_internal_node,
        )
except ImportError:
    from bzrlib._chk_map_py import (
        _search_key_16,
        _search_key_255,
        _deserialise_leaf_node,
        _deserialise_internal_node,
        )
search_key_registry.register('hash-16-way', _search_key_16)
search_key_registry.register('hash-255-way', _search_key_255)
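# Illustrative note (not from the original source): the try/except above is
# the usual bzrlib pattern of preferring the compiled (pyx) extension and
# falling back to the pure-Python module. Callers resolve a search key
# function by name through the registry, e.g.:
#
#     search_key_func = search_key_registry.get('hash-16-way')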