/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
17
"""Persistent maps from tuple_of_strings->string using CHK stores.
18
19
Overview and current status:
20
21
The CHKMap class implements a dict from tuple_of_strings->string by using a trie
22
with internal nodes of 8-bit fan out; The key tuples are mapped to strings by
23
joining them by \x00, and \x00 padding shorter keys out to the length of the
24
longest key. Leaf nodes are packed as densely as possible, and internal nodes
25
are all and additional 8-bits wide leading to a sparse upper tree. 
26
27
Updates to a CHKMap are done preferentially via the apply_delta method, to
28
allow optimisation of the update operation; but individual map/unmap calls are
29
possible and supported. All changes via map/unmap are buffered in memory until
30
the _save method is called to force serialisation of the tree. apply_delta
31
performs a _save implicitly.
32
33
TODO:
34
-----
35
36
Densely packed upper nodes.
37
38
"""
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
39
3735.2.31 by Robert Collins
CHKMap.iter_changes
40
import heapq
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
41
import osutils
42
43
44
class CHKMap(object):
45
    """A persistent map from string to string backed by a CHK store."""
46
47
    def __init__(self, store, root_key):
48
        """Create a CHKMap object.
49
50
        :param store: The store the CHKMap is stored in.
51
        :param root_key: The root key of the map. None to create an empty
52
            CHKMap.
53
        """
54
        self._store = store
55
        if root_key is None:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
56
            self._root_node = LeafNode()
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
57
        else:
3735.2.41 by Robert Collins
Make the parent_id_basename index be updated during CHKInventory.apply_delta.
58
            self._root_node = self._node_key(root_key)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
59
60
    def apply_delta(self, delta):
61
        """Apply a delta to the map.
62
63
        :param delta: An iterable of old_key, new_key, new_value tuples.
64
            If new_key is not None, then new_key->new_value is inserted
65
            into the map; if old_key is not None, then the old mapping
66
            of old_key is removed.
67
        """
68
        for old, new, value in delta:
69
            if old is not None and old != new:
70
                # unmap
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
71
                self.unmap(old)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
72
        for old, new, value in delta:
73
            if new is not None:
74
                # map
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
75
                self.map(new, value)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
76
        return self._save()
77
78
    def _ensure_root(self):
79
        """Ensure that the root node is an object not a key."""
80
        if type(self._root_node) == tuple:
81
            # Demand-load the root
3735.2.31 by Robert Collins
CHKMap.iter_changes
82
            self._root_node = self._get_node(self._root_node)
83
84
    def _get_node(self, node):
85
        """Get a node.
86
87
        Node that this does not update the _items dict in objects containing a
88
        reference to this node. As such it does not prevent subsequent IO being
89
        performed.
90
        
91
        :param node: A tuple key or node object.
92
        :return: A node object.
93
        """
94
        if type(node) == tuple:
95
            bytes = self._read_bytes(node)
96
            return _deserialise(bytes, node)
97
        else:
98
            return node
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
99
100
    def _read_bytes(self, key):
101
        stream = self._store.get_record_stream([key], 'unordered', True)
102
        return stream.next().get_bytes_as('fulltext')
103
104
    @classmethod
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
105
    def from_dict(klass, store, initial_value, maximum_size=0, key_width=1):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
106
        """Create a CHKMap in store with initial_value as the content.
107
        
108
        :param store: The store to record initial_value in, a VersionedFiles
109
            object with 1-tuple keys supporting CHK key generation.
110
        :param initial_value: A dict to store in store. Its keys and values
111
            must be bytestrings.
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
112
        :param maximum_size: The maximum_size rule to apply to nodes. This
113
            determines the size at which no new data is added to a single node.
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
114
        :param key_width: The number of elements in each key_tuple being stored
115
            in this map.
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
116
        :return: The root chk of te resulting CHKMap.
117
        """
118
        result = CHKMap(store, None)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
119
        result._root_node.set_maximum_size(maximum_size)
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
120
        result._root_node._key_width = key_width
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
121
        delta = []
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
122
        for key, value in initial_value.items():
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
123
            delta.append((None, key, value))
124
        result.apply_delta(delta)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
125
        return result._save()
126
3735.2.30 by Robert Collins
Start iter_changes between CHKMap instances.
127
    def iter_changes(self, basis):
128
        """Iterate over the changes between basis and self.
129
130
        :return: An iterator of tuples: (key, old_value, new_value). Old_value
131
            is None for keys only in self; new_value is None for keys only in
132
            basis.
133
        """
3735.2.31 by Robert Collins
CHKMap.iter_changes
134
        # Overview:
135
        # Read both trees in lexographic, highest-first order.
136
        # Any identical nodes we skip
137
        # Any unique prefixes we output immediately.
138
        # values in a leaf node are treated as single-value nodes in the tree
139
        # which allows them to be not-special-cased. We know to output them
140
        # because their value is a string, not a key(tuple) or node.
141
        #
142
        # corner cases to beware of when considering this function:
143
        # *) common references are at different heights.
144
        #    consider two trees:
145
        #    {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
146
        #    {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'}, 'ab':LeafNode={'ab':'bar'}}
147
        #     'b': LeafNode={'b'}}
148
        #    the node with aaa/aab will only be encountered in the second tree
149
        #    after reading the 'a' subtree, but it is encountered in the first
150
        #    tree immediately. Variations on this may have read internal nodes like this.
151
        #    we want to cut the entire pending subtree when we realise we have a common node.
152
        #    For this we use a list of keys - the path to a node - and check the entire path is 
153
        #    clean as we process each item.
154
        if self._node_key(self._root_node) == self._node_key(basis._root_node):
155
            return
156
        self._ensure_root()
157
        basis._ensure_root()
158
        excluded_keys = set()
159
        self_node = self._root_node
160
        basis_node = basis._root_node
161
        # A heap, each element is prefix, node(tuple/NodeObject/string),
162
        # key_path (a list of tuples, tail-sharing down the tree.)
163
        self_pending = []
164
        basis_pending = []
165
        def process_node(prefix, node, path, a_map, pending):
166
            # take a node and expand it
167
            node = a_map._get_node(node)
168
            if type(node) == LeafNode:
169
                path = (node._key, path)
170
                for key, value in node._items.items():
171
                    heapq.heappush(pending, ('\x00'.join(key), value, path))
172
            else:
173
                # type(node) == InternalNode
174
                path = (node._key, path)
175
                for prefix, child in node._items.items():
176
                    heapq.heappush(pending, (prefix, child, path))
177
        process_node(None, self_node, None, self, self_pending)
178
        process_node(None, basis_node, None, basis, basis_pending)
179
        self_seen = set()
180
        basis_seen = set()
181
        excluded_keys = set()
182
        def check_excluded(key_path):
183
            # Note that this is N^2, it depends on us trimming trees
184
            # aggressively to not become slow.
185
            # A better implementation would probably have a reverse map
186
            # back to the children of a node, and jump straight to it when 
187
            # a common node is detected, the proceed to remove the already
188
            # pending children. bzrlib.graph has a searcher module with a
189
            # similar problem.
190
            while key_path is not None:
191
                key, key_path = key_path
192
                if key in excluded_keys:
193
                    return True
194
            return False
195
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
196
        loop_counter = 0
3735.2.31 by Robert Collins
CHKMap.iter_changes
197
        while self_pending or basis_pending:
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
198
            loop_counter += 1
3735.2.31 by Robert Collins
CHKMap.iter_changes
199
            if not self_pending:
200
                # self is exhausted: output remainder of basis
201
                for prefix, node, path in basis_pending:
202
                    if check_excluded(path):
203
                        continue
204
                    node = basis._get_node(node)
205
                    if type(node) == str:
206
                        # a value
207
                        yield (tuple(prefix.split('\x00')), node, None)
208
                    else:
209
                        # subtree - fastpath the entire thing.
210
                        for key, value in node.iteritems(basis._store):
211
                            yield (key, value, None)
212
                return
213
            elif not basis_pending:
214
                # basis is exhausted: output remainder of self.
215
                for prefix, node, path in self_pending:
216
                    if check_excluded(path):
217
                        continue
218
                    node = self._get_node(node)
219
                    if type(node) == str:
220
                        # a value
221
                        yield (tuple(prefix.split('\x00')), None, node)
222
                    else:
223
                        # subtree - fastpath the entire thing.
224
                        for key, value in node.iteritems(self._store):
225
                            yield (key, None, value)
226
                return
227
            else:
228
                # XXX: future optimisation - yield the smaller items
229
                # immediately rather than pushing everything on/off the
230
                # heaps. Applies to both internal nodes and leafnodes.
231
                if self_pending[0][0] < basis_pending[0][0]:
232
                    # expand self
233
                    prefix, node, path = heapq.heappop(self_pending)
234
                    if check_excluded(path):
235
                        continue
236
                    if type(node) == str:
237
                        # a value
238
                        yield (tuple(prefix.split('\x00')), None, node)
239
                    else:
240
                        process_node(prefix, node, path, self, self_pending)
241
                        continue
242
                elif self_pending[0][0] > basis_pending[0][0]:
243
                    # expand basis
244
                    prefix, node, path = heapq.heappop(basis_pending)
245
                    if check_excluded(path):
246
                        continue
247
                    if type(node) == str:
248
                        # a value
249
                        yield (tuple(prefix.split('\x00')), node, None)
250
                    else:
251
                        process_node(prefix, node, path, basis, basis_pending)
252
                        continue
253
                else:
254
                    # common prefix: possibly expand both
255
                    if type(self_pending[0][1]) != str:
256
                        # process next self
257
                        read_self = True
258
                    else:
259
                        read_self = False
260
                    if type(basis_pending[0][1]) != str:
261
                        # process next basis
262
                        read_basis = True
263
                    else:
264
                        read_basis = False
265
                    if not read_self and not read_basis:
266
                        # compare a common value
267
                        self_details = heapq.heappop(self_pending)
268
                        basis_details = heapq.heappop(basis_pending)
269
                        if self_details[1] != basis_details[1]:
270
                            yield (tuple(self_details[0].split('\x00')),
271
                                basis_details[1], self_details[1])
272
                        continue
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
273
                    # At least one side wasn't a string.
274
                    if (self._node_key(self_pending[0][1]) ==
275
                        self._node_key(basis_pending[0][1])):
276
                        # Identical pointers, skip (and don't bother adding to
277
                        # excluded, it won't turn up again.
278
                        heapq.heappop(self_pending)
279
                        heapq.heappop(basis_pending)
280
                        continue
281
                    # Now we need to expand this node before we can continue
3735.2.31 by Robert Collins
CHKMap.iter_changes
282
                    if read_self:
283
                        prefix, node, path = heapq.heappop(self_pending)
284
                        if check_excluded(path):
285
                            continue
286
                        process_node(prefix, node, path, self, self_pending)
287
                    if read_basis:
288
                        prefix, node, path = heapq.heappop(basis_pending)
289
                        if check_excluded(path):
290
                            continue
291
                        process_node(prefix, node, path, basis, basis_pending)
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
292
        # print loop_counter
3735.2.30 by Robert Collins
Start iter_changes between CHKMap instances.
293
3735.2.9 by Robert Collins
Get a working chk_map using inventory implementation bootstrapped.
294
    def iteritems(self, key_filter=None):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
295
        """Iterate over the entire CHKMap's contents."""
296
        self._ensure_root()
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
297
        return self._root_node.iteritems(self._store, key_filter=key_filter)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
298
3735.2.12 by Robert Collins
Implement commit-via-deltas for split inventory repositories.
299
    def key(self):
300
        """Return the key for this map."""
301
        if isinstance(self._root_node, tuple):
302
            return self._root_node
303
        else:
304
            return self._root_node._key
305
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
306
    def __len__(self):
307
        self._ensure_root()
308
        return len(self._root_node)
309
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
310
    def map(self, key, value):
311
        """Map a key tuple to value."""
312
        # Need a root object.
313
        self._ensure_root()
314
        prefix, node_details = self._root_node.map(self._store, key, value)
315
        if len(node_details) == 1:
316
            self._root_node = node_details[0][1]
317
        else:
318
            self._root_node = InternalNode()
319
            self._root_node.set_maximum_size(node_details[0][1].maximum_size)
320
            self._root_node._key_width = node_details[0][1]._key_width
321
            for split, node in node_details:
322
                self._root_node.add_node(split, node)
323
3735.2.31 by Robert Collins
CHKMap.iter_changes
324
    def _node_key(self, node):
325
        """Get the key for a node whether its a tuple o r node."""
326
        if type(node) == tuple:
327
            return node
328
        else:
329
            return node._key
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
330
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
331
    def unmap(self, key):
332
        """remove key from the map."""
333
        self._ensure_root()
334
        self._root_node.unmap(self._store, key)
335
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
336
    def _save(self):
337
        """Save the map completely.
338
339
        :return: The key of the root node.
340
        """
341
        if type(self._root_node) == tuple:
342
            # Already saved.
343
            return self._root_node
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
344
        keys = list(self._root_node.serialise(self._store))
345
        return keys[-1]
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
346
347
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
348
class Node(object):
349
    """Base class defining the protocol for CHK Map nodes."""
350
351
    def __init__(self, key_width=1):
352
        """Create a node.
353
354
        :param key_width: The width of keys for this node.
355
        """
356
        self._key = None
357
        # Current number of elements
358
        self._len = 0
359
        self._maximum_size = 0
360
        self._key_width = 1
361
        # current size in bytes
362
        self._size = 0
363
        # The pointers/values this node has - meaning defined by child classes.
364
        self._items = {}
365
3735.2.38 by Robert Collins
Sufficient fixes to allow bzr-search to index a dev3 format repository.
366
    def key(self):
367
        return self._key
368
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
369
    def __len__(self):
370
        return self._len
371
372
    @property
373
    def maximum_size(self):
374
        """What is the upper limit for adding references to a node."""
375
        return self._maximum_size
376
377
    def set_maximum_size(self, new_size):
378
        """Set the size threshold for nodes.
379
380
        :param new_size: The size at which no data is added to a node. 0 for
381
            unlimited.
382
        """
383
        self._maximum_size = new_size
384
385
386
class LeafNode(Node):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
387
    """A node containing actual key:value pairs.
388
    
389
    :ivar _items: A dict of key->value items. The key is in tuple form.
390
    """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
391
392
    def __init__(self):
393
        Node.__init__(self)
394
        # The size of a leaf node with default values and no children.
395
        self._size = 12
396
397
    def _current_size(self):
398
        """Answer the current serialised size of this node."""
399
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
400
            len(str(self._maximum_size)))
401
402
    @classmethod
403
    def deserialise(klass, bytes, key):
404
        """Deserialise bytes, with key key, into a LeafNode.
405
406
        :param bytes: The bytes of the node.
407
        :param key: The key that the serialised node has.
408
        """
409
        result = LeafNode()
410
        lines = bytes.splitlines()
411
        items = {}
412
        if lines[0] != 'chkleaf:':
413
            raise ValueError("not a serialised leaf node: %r" % bytes)
414
        maximum_size = int(lines[1])
415
        width = int(lines[2])
416
        length = int(lines[3])
417
        for line in lines[4:]:
3735.2.25 by Robert Collins
CHKInventory core tests passing.
418
            elements = line.split('\x00', width)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
419
            items[tuple(elements[:-1])] = elements[-1]
420
        if len(items) != length:
421
            raise AssertionError("item count mismatch")
422
        result._items = items
423
        result._len = length
424
        result._maximum_size = maximum_size
425
        result._key = key
426
        result._key_width = width
427
        result._size = len(bytes)
428
        return result
429
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
430
    def iteritems(self, store, key_filter=None):
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
431
        """Iterate over items in the node.
432
433
        :param key_filter: A filter to apply to the node. It should be a
434
            list/set/dict or similar repeatedly iterable container.
435
        """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
436
        if key_filter is not None:
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
437
            # Adjust the filter - short elements go to a prefix filter. Would this
438
            # be cleaner explicitly? That would be no harder for InternalNode..
439
            # XXX: perhaps defaultdict? Profiling<rinse and repeat>
440
            filters = {}
441
            for key in key_filter:
442
                length_filter = filters.setdefault(len(key), set())
443
                length_filter.add(key)
444
            filters = filters.items()
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
445
            for item in self._items.iteritems():
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
446
                for length, length_filter in filters:
447
                    if item[0][:length] in length_filter:
448
                        yield item
449
                        break
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
450
        else:
451
            for item in self._items.iteritems():
452
                yield item
453
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
454
    def map(self, store, key, value):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
455
        """Map key to value."""
456
        if key in self._items:
457
            self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
458
            self._len -= 1
459
        self._items[key] = value
460
        self._size += 2 + len('\x00'.join(key)) + len(value)
461
        self._len += 1
462
        self._key = None
463
        if (self._maximum_size and self._current_size() > self._maximum_size and
464
            self._len > 1):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
465
            common_prefix = self.unique_serialised_prefix()
466
            split_at = len(common_prefix) + 1
467
            result = {}
468
            for key, value in self._items.iteritems():
469
                serialised_key = self._serialised_key(key)
470
                prefix = serialised_key[:split_at]
471
                if prefix not in result:
472
                    node = LeafNode()
473
                    node.set_maximum_size(self._maximum_size)
474
                    node._key_width = self._key_width
475
                    result[prefix] = node
476
                else:
477
                    node = result[prefix]
478
                node.map(store, key, value)
479
            return common_prefix, result.items()
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
480
        else:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
481
            return self.unique_serialised_prefix(), [("", self)]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
482
483
    def serialise(self, store):
484
        """Serialise the tree to store.
485
486
        :param store: A VersionedFiles honouring the CHK extensions.
487
        :return: An iterable of the keys inserted by this operation.
488
        """
489
        lines = ["chkleaf:\n"]
490
        lines.append("%d\n" % self._maximum_size)
491
        lines.append("%d\n" % self._key_width)
492
        lines.append("%d\n" % self._len)
493
        for key, value in sorted(self._items.items()):
494
            lines.append("%s\x00%s\n" % ('\x00'.join(key), value))
495
        sha1, _, _ = store.add_lines((None,), (), lines)
496
        self._key = ("sha1:" + sha1,)
497
        return [self._key]
498
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
499
    def _serialised_key(self, key):
500
        """Return the serialised key for key in this node."""
501
        return '\x00'.join(key)
502
3735.2.26 by Robert Collins
CHKInventory migrated to new CHKMap code.
503
    def refs(self):
504
        """Return the references to other CHK's held by this node."""
505
        return []
506
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
507
    def unique_serialised_prefix(self):
508
        """Return the unique key prefix for this node.
509
510
        :return: A bytestring of the longest serialised key prefix that is
511
            unique within this node.
512
        """
513
        # may want to cache this eventually :- but wait for enough
514
        # functionality to profile.
515
        keys = list(self._items.keys())
516
        if not keys:
517
            return ""
518
        current_prefix = self._serialised_key(keys.pop(-1))
519
        while current_prefix and keys:
520
            next_key = self._serialised_key(keys.pop(-1))
521
            for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
522
                if left != right:
523
                    pos -= 1
524
                    break
525
            current_prefix = current_prefix[:pos + 1]
526
        return current_prefix
527
528
    def unmap(self, store, key):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
529
        """Unmap key from the node."""
530
        self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
531
        self._len -= 1
532
        del self._items[key]
533
        self._key = None
534
        return self
535
536
537
class InternalNode(Node):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
538
    """A node that contains references to other nodes.
539
    
540
    An InternalNode is responsible for mapping serialised key prefixes to child
541
    nodes. It is greedy - it will defer splitting itself as long as possible.
542
    """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
543
544
    def __init__(self):
545
        Node.__init__(self)
546
        # The size of an internalnode with default values and no children.
547
        # self._size = 12
548
        # How many octets key prefixes within this node are.
549
        self._node_width = 0
550
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
551
    def add_node(self, prefix, node):
552
        """Add a child node with prefix prefix, and node node.
553
554
        :param prefix: The serialised key prefix for node.
555
        :param node: The node being added.
556
        """
557
        self._len += len(node)
558
        if not len(self._items):
559
            self._node_width = len(prefix)
560
        self._items[prefix] = node
561
        self._key = None
562
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
563
    def _current_size(self):
564
        """Answer the current serialised size of this node."""
565
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
566
            len(str(self._maximum_size)))
567
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
568
    @classmethod
569
    def deserialise(klass, bytes, key):
3735.2.25 by Robert Collins
CHKInventory core tests passing.
570
        """Deserialise bytes to an InternalNode, with key key.
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
571
572
        :param bytes: The bytes of the node.
573
        :param key: The key that the serialised node has.
574
        :return: An InternalNode instance.
575
        """
576
        result = InternalNode()
577
        lines = bytes.splitlines()
578
        items = {}
579
        if lines[0] != 'chknode:':
580
            raise ValueError("not a serialised internal node: %r" % bytes)
581
        maximum_size = int(lines[1])
582
        width = int(lines[2])
583
        length = int(lines[3])
584
        for line in lines[4:]:
585
            prefix, flat_key = line.rsplit('\x00', 1)
586
            items[prefix] = (flat_key,)
587
        result._items = items
588
        result._len = length
589
        result._maximum_size = maximum_size
590
        result._key = key
591
        result._key_width = width
592
        result._size = len(bytes)
593
        result._node_width = len(prefix)
594
        return result
595
596
    def iteritems(self, store, key_filter=None):
597
        for node in self._iter_nodes(store, key_filter=key_filter):
598
            for item in node.iteritems(store, key_filter=key_filter):
599
                yield item
600
601
    def _iter_nodes(self, store, key_filter=None):
602
        """Iterate over node objects which match key_filter.
603
604
        :param store: A store to use for accessing content.
605
        :param key_filter: A key filter to filter nodes. Only nodes that might
606
            contain a key in key_filter will be returned.
607
        :return: An iterable of nodes.
608
        """
609
        nodes = []
3735.2.31 by Robert Collins
CHKMap.iter_changes
610
        keys = {}
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
611
        if key_filter is None:
3735.2.31 by Robert Collins
CHKMap.iter_changes
612
            for prefix, node in self._items.iteritems():
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
613
                if type(node) == tuple:
3735.2.31 by Robert Collins
CHKMap.iter_changes
614
                    keys[node] = prefix
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
615
                else:
616
                    nodes.append(node)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
617
        else:
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
618
            # XXX defaultdict ?
619
            length_filters = {}
620
            for key in key_filter:
621
                serialised_key = self._serialised_prefix_filter(key)
622
                length_filter = length_filters.setdefault(len(serialised_key),
623
                    set())
624
                length_filter.add(serialised_key)
625
            length_filters = length_filters.items()
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
626
            for prefix, node in self._items.iteritems():
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
627
                for length, length_filter in length_filters:
628
                    if prefix[:length] in length_filter:
629
                        if type(node) == tuple:
630
                            keys[node] = prefix
631
                        else:
632
                            nodes.append(node)
633
                        break
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
634
        if keys:
635
            # demand load some pages.
636
            stream = store.get_record_stream(keys, 'unordered', True)
637
            for record in stream:
638
                node = _deserialise(record.get_bytes_as('fulltext'), record.key)
639
                nodes.append(node)
3735.2.31 by Robert Collins
CHKMap.iter_changes
640
                self._items[keys[record.key]] = node
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
641
        return nodes
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
642
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
643
    def map(self, store, key, value):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
644
        """Map key to value."""
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
645
        if not len(self._items):
646
            raise AssertionError("cant map in an empty InternalNode.")
647
        children = self._iter_nodes(store, key_filter=[key])
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
648
        serialised_key = self._serialised_key(key)
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
649
        if children:
650
            child = children[0]
651
        else:
652
            # new child needed:
653
            child = self._new_child(serialised_key, LeafNode)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
654
        old_len = len(child)
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
655
        prefix, node_details = child.map(store, key, value)
656
        if len(node_details) == 1:
657
            # child may have shrunk, or might be the same.
658
            self._len = self._len - old_len + len(child)
659
            self._items[serialised_key] = child
3735.2.29 by Robert Collins
Untested code is broken code.
660
            self._key = None
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
661
            return self.unique_serialised_prefix(), [("", self)]
662
        # child has overflown - create a new intermediate node.
663
        # XXX: This is where we might want to try and expand our depth
664
        # to refer to more bytes of every child (which would give us
665
        # multiple pointers to child nodes, but less intermediate nodes)
666
        child = self._new_child(serialised_key, InternalNode)
667
        for split, node in node_details:
668
            child.add_node(split, node)
669
        self._len = self._len - old_len + len(child)
3735.2.29 by Robert Collins
Untested code is broken code.
670
        self._key = None
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
671
        return self.unique_serialised_prefix(), [("", self)]
672
673
    def _new_child(self, serialised_key, klass):
674
        """Create a new child node of type klass."""
675
        child = klass()
676
        child.set_maximum_size(self._maximum_size)
677
        child._key_width = self._key_width
678
        self._items[serialised_key] = child
679
        return child
680
681
    def serialise(self, store):
682
        """Serialise the node to store.
683
684
        :param store: A VersionedFiles honouring the CHK extensions.
685
        :return: An iterable of the keys inserted by this operation.
686
        """
687
        for node in self._items.itervalues():
688
            if type(node) == tuple:
689
                # Never deserialised.
690
                continue
691
            if node._key is not None:
692
                # Never altered
693
                continue
694
            for key in node.serialise(store):
695
                yield key
696
        lines = ["chknode:\n"]
697
        lines.append("%d\n" % self._maximum_size)
698
        lines.append("%d\n" % self._key_width)
699
        lines.append("%d\n" % self._len)
700
        for prefix, node in sorted(self._items.items()):
701
            if type(node) == tuple:
702
                key = node[0]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
703
            else:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
704
                key = node._key[0]
705
            lines.append("%s\x00%s\n" % (prefix, key))
706
        sha1, _, _ = store.add_lines((None,), (), lines)
707
        self._key = ("sha1:" + sha1,)
708
        yield self._key
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
709
710
    def _serialised_key(self, key):
711
        """Return the serialised key for key in this node."""
712
        return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width]
713
3735.2.43 by Robert Collins
Teach CHKMap how to iter items in 2-tuple keyspaces.
714
    def _serialised_prefix_filter(self, key):
715
        """Serialise key for use as a prefix filter in iteritems."""
716
        if len(key) == self._key_width:
717
            return self._serialised_key(key)
718
        return '\x00'.join(key)[:self._node_width]
719
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
720
    def _split(self, offset):
721
        """Split this node into smaller nodes starting at offset.
722
723
        :param offset: The offset to start the new child nodes at.
724
        :return: An iterable of (prefix, node) tuples. prefix is a byte
725
            prefix for reaching node.
726
        """
727
        if offset >= self._node_width:
728
            for node in self._items.values():
729
                for result in node._split(offset):
730
                    yield result
731
            return
732
        for key, node in self._items.items():
733
            pass
734
3735.2.26 by Robert Collins
CHKInventory migrated to new CHKMap code.
735
    def refs(self):
736
        """Return the references to other CHK's held by this node."""
737
        if self._key is None:
738
            raise AssertionError("unserialised nodes have no refs.")
739
        refs = []
740
        for value in self._items.itervalues():
741
            if type(value) == tuple:
742
                refs.append(value)
743
            else:
744
                refs.append(value.key())
745
        return refs
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
746
747
    def unique_serialised_prefix(self):
748
        """Return the unique key prefix for this node.
749
750
        :return: A bytestring of the longest serialised key prefix that is
751
            unique within this node.
752
        """
753
        # may want to cache this eventually :- but wait for enough
754
        # functionality to profile.
755
        keys = list(self._items.keys())
756
        if not keys:
757
            return ""
758
        current_prefix = keys.pop(-1)
759
        while current_prefix and keys:
760
            next_key = keys.pop(-1)
761
            for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
762
                if left != right:
763
                    pos -= 1
764
                    break
765
            current_prefix = current_prefix[:pos + 1]
766
        return current_prefix
767
768
    def unmap(self, store, key):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
769
        """Remove key from this node and it's children."""
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
770
        if not len(self._items):
771
            raise AssertionError("cant unmap in an empty InternalNode.")
772
        serialised_key = self._serialised_key(key)
773
        children = self._iter_nodes(store, key_filter=[key])
774
        serialised_key = self._serialised_key(key)
775
        if children:
776
            child = children[0]
777
        else:
778
            raise KeyError(key)
779
        self._len -= 1
780
        unmapped = child.unmap(store, key)
781
        if len(unmapped) == 0:
782
            # All child nodes are gone, remove the child:
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
783
            del self._items[serialised_key]
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
784
        else:
785
            # Stash the returned node
786
            self._items[serialised_key] = unmapped
3735.2.23 by Robert Collins
Test unmapping with one child left but multiple keys.
787
        if len(self._items) == 1:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
788
            # this node is no longer needed:
789
            return self._items.values()[0]
3735.2.29 by Robert Collins
Untested code is broken code.
790
        self._key = None
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
791
        return self
792
793
3735.2.16 by Robert Collins
Untested extensions to support repodetails
794
def _deserialise(bytes, key):
795
    """Helper for repositorydetails - convert bytes to a node."""
3735.2.24 by Robert Collins
test_chk_map tests all passing.
796
    if bytes.startswith("chkleaf:\n"):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
797
        return LeafNode.deserialise(bytes, key)
798
    elif bytes.startswith("chknode:\n"):
799
        return InternalNode.deserialise(bytes, key)
3735.2.16 by Robert Collins
Untested extensions to support repodetails
800
    else:
801
        raise AssertionError("Unknown node type.")