/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
17
"""Persistent maps from tuple_of_strings->string using CHK stores.
18
19
Overview and current status:
20
21
The CHKMap class implements a dict from tuple_of_strings->string by using a trie
22
with internal nodes of 8-bit fan out; The key tuples are mapped to strings by
23
joining them by \x00, and \x00 padding shorter keys out to the length of the
24
longest key. Leaf nodes are packed as densely as possible, and internal nodes
25
are all and additional 8-bits wide leading to a sparse upper tree. 
26
27
Updates to a CHKMap are done preferentially via the apply_delta method, to
28
allow optimisation of the update operation; but individual map/unmap calls are
29
possible and supported. All changes via map/unmap are buffered in memory until
30
the _save method is called to force serialisation of the tree. apply_delta
31
performs a _save implicitly.
32
33
TODO:
34
-----
35
36
Densely packed upper nodes.
37
38
"""
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
39
3735.2.31 by Robert Collins
CHKMap.iter_changes
40
import heapq
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
41
import osutils
42
43
44
class CHKMap(object):
45
    """A persistent map from string to string backed by a CHK store."""
46
47
    def __init__(self, store, root_key):
48
        """Create a CHKMap object.
49
50
        :param store: The store the CHKMap is stored in.
51
        :param root_key: The root key of the map. None to create an empty
52
            CHKMap.
53
        """
54
        self._store = store
55
        if root_key is None:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
56
            self._root_node = LeafNode()
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
57
        else:
58
            self._root_node = root_key
59
60
    def apply_delta(self, delta):
61
        """Apply a delta to the map.
62
63
        :param delta: An iterable of old_key, new_key, new_value tuples.
64
            If new_key is not None, then new_key->new_value is inserted
65
            into the map; if old_key is not None, then the old mapping
66
            of old_key is removed.
67
        """
68
        for old, new, value in delta:
69
            if old is not None and old != new:
70
                # unmap
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
71
                self.unmap(old)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
72
        for old, new, value in delta:
73
            if new is not None:
74
                # map
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
75
                self.map(new, value)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
76
        return self._save()
77
78
    def _ensure_root(self):
79
        """Ensure that the root node is an object not a key."""
80
        if type(self._root_node) == tuple:
81
            # Demand-load the root
3735.2.31 by Robert Collins
CHKMap.iter_changes
82
            self._root_node = self._get_node(self._root_node)
83
84
    def _get_node(self, node):
85
        """Get a node.
86
87
        Node that this does not update the _items dict in objects containing a
88
        reference to this node. As such it does not prevent subsequent IO being
89
        performed.
90
        
91
        :param node: A tuple key or node object.
92
        :return: A node object.
93
        """
94
        if type(node) == tuple:
95
            bytes = self._read_bytes(node)
96
            return _deserialise(bytes, node)
97
        else:
98
            return node
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
99
100
    def _read_bytes(self, key):
101
        stream = self._store.get_record_stream([key], 'unordered', True)
102
        return stream.next().get_bytes_as('fulltext')
103
104
    @classmethod
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
105
    def from_dict(klass, store, initial_value, maximum_size=0):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
106
        """Create a CHKMap in store with initial_value as the content.
107
        
108
        :param store: The store to record initial_value in, a VersionedFiles
109
            object with 1-tuple keys supporting CHK key generation.
110
        :param initial_value: A dict to store in store. Its keys and values
111
            must be bytestrings.
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
112
        :param maximum_size: The maximum_size rule to apply to nodes. This
113
            determines the size at which no new data is added to a single node.
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
114
        :return: The root chk of te resulting CHKMap.
115
        """
116
        result = CHKMap(store, None)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
117
        result._root_node.set_maximum_size(maximum_size)
118
        delta = []
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
119
        for key, value in initial_value.items():
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
120
            delta.append((None, key, value))
121
        result.apply_delta(delta)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
122
        return result._save()
123
3735.2.30 by Robert Collins
Start iter_changes between CHKMap instances.
124
    def iter_changes(self, basis):
125
        """Iterate over the changes between basis and self.
126
127
        :return: An iterator of tuples: (key, old_value, new_value). Old_value
128
            is None for keys only in self; new_value is None for keys only in
129
            basis.
130
        """
3735.2.31 by Robert Collins
CHKMap.iter_changes
131
        # Overview:
132
        # Read both trees in lexographic, highest-first order.
133
        # Any identical nodes we skip
134
        # Any unique prefixes we output immediately.
135
        # values in a leaf node are treated as single-value nodes in the tree
136
        # which allows them to be not-special-cased. We know to output them
137
        # because their value is a string, not a key(tuple) or node.
138
        #
139
        # corner cases to beware of when considering this function:
140
        # *) common references are at different heights.
141
        #    consider two trees:
142
        #    {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
143
        #    {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'}, 'ab':LeafNode={'ab':'bar'}}
144
        #     'b': LeafNode={'b'}}
145
        #    the node with aaa/aab will only be encountered in the second tree
146
        #    after reading the 'a' subtree, but it is encountered in the first
147
        #    tree immediately. Variations on this may have read internal nodes like this.
148
        #    we want to cut the entire pending subtree when we realise we have a common node.
149
        #    For this we use a list of keys - the path to a node - and check the entire path is 
150
        #    clean as we process each item.
151
        if self._node_key(self._root_node) == self._node_key(basis._root_node):
152
            return
153
        self._ensure_root()
154
        basis._ensure_root()
155
        excluded_keys = set()
156
        self_node = self._root_node
157
        basis_node = basis._root_node
158
        # A heap, each element is prefix, node(tuple/NodeObject/string),
159
        # key_path (a list of tuples, tail-sharing down the tree.)
160
        self_pending = []
161
        basis_pending = []
162
        def process_node(prefix, node, path, a_map, pending):
163
            # take a node and expand it
164
            node = a_map._get_node(node)
165
            if type(node) == LeafNode:
166
                path = (node._key, path)
167
                for key, value in node._items.items():
168
                    heapq.heappush(pending, ('\x00'.join(key), value, path))
169
            else:
170
                # type(node) == InternalNode
171
                path = (node._key, path)
172
                for prefix, child in node._items.items():
173
                    heapq.heappush(pending, (prefix, child, path))
174
        process_node(None, self_node, None, self, self_pending)
175
        process_node(None, basis_node, None, basis, basis_pending)
176
        self_seen = set()
177
        basis_seen = set()
178
        excluded_keys = set()
179
        def check_excluded(key_path):
180
            # Note that this is N^2, it depends on us trimming trees
181
            # aggressively to not become slow.
182
            # A better implementation would probably have a reverse map
183
            # back to the children of a node, and jump straight to it when 
184
            # a common node is detected, the proceed to remove the already
185
            # pending children. bzrlib.graph has a searcher module with a
186
            # similar problem.
187
            while key_path is not None:
188
                key, key_path = key_path
189
                if key in excluded_keys:
190
                    return True
191
            return False
192
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
193
        loop_counter = 0
3735.2.31 by Robert Collins
CHKMap.iter_changes
194
        while self_pending or basis_pending:
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
195
            loop_counter += 1
3735.2.31 by Robert Collins
CHKMap.iter_changes
196
            if not self_pending:
197
                # self is exhausted: output remainder of basis
198
                for prefix, node, path in basis_pending:
199
                    if check_excluded(path):
200
                        continue
201
                    node = basis._get_node(node)
202
                    if type(node) == str:
203
                        # a value
204
                        yield (tuple(prefix.split('\x00')), node, None)
205
                    else:
206
                        # subtree - fastpath the entire thing.
207
                        for key, value in node.iteritems(basis._store):
208
                            yield (key, value, None)
209
                return
210
            elif not basis_pending:
211
                # basis is exhausted: output remainder of self.
212
                for prefix, node, path in self_pending:
213
                    if check_excluded(path):
214
                        continue
215
                    node = self._get_node(node)
216
                    if type(node) == str:
217
                        # a value
218
                        yield (tuple(prefix.split('\x00')), None, node)
219
                    else:
220
                        # subtree - fastpath the entire thing.
221
                        for key, value in node.iteritems(self._store):
222
                            yield (key, None, value)
223
                return
224
            else:
225
                # XXX: future optimisation - yield the smaller items
226
                # immediately rather than pushing everything on/off the
227
                # heaps. Applies to both internal nodes and leafnodes.
228
                if self_pending[0][0] < basis_pending[0][0]:
229
                    # expand self
230
                    prefix, node, path = heapq.heappop(self_pending)
231
                    if check_excluded(path):
232
                        continue
233
                    if type(node) == str:
234
                        # a value
235
                        yield (tuple(prefix.split('\x00')), None, node)
236
                    else:
237
                        process_node(prefix, node, path, self, self_pending)
238
                        continue
239
                elif self_pending[0][0] > basis_pending[0][0]:
240
                    # expand basis
241
                    prefix, node, path = heapq.heappop(basis_pending)
242
                    if check_excluded(path):
243
                        continue
244
                    if type(node) == str:
245
                        # a value
246
                        yield (tuple(prefix.split('\x00')), node, None)
247
                    else:
248
                        process_node(prefix, node, path, basis, basis_pending)
249
                        continue
250
                else:
251
                    # common prefix: possibly expand both
252
                    if type(self_pending[0][1]) != str:
253
                        # process next self
254
                        read_self = True
255
                    else:
256
                        read_self = False
257
                    if type(basis_pending[0][1]) != str:
258
                        # process next basis
259
                        read_basis = True
260
                    else:
261
                        read_basis = False
262
                    if not read_self and not read_basis:
263
                        # compare a common value
264
                        self_details = heapq.heappop(self_pending)
265
                        basis_details = heapq.heappop(basis_pending)
266
                        if self_details[1] != basis_details[1]:
267
                            yield (tuple(self_details[0].split('\x00')),
268
                                basis_details[1], self_details[1])
269
                        continue
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
270
                    # At least one side wasn't a string.
271
                    if (self._node_key(self_pending[0][1]) ==
272
                        self._node_key(basis_pending[0][1])):
273
                        # Identical pointers, skip (and don't bother adding to
274
                        # excluded, it won't turn up again.
275
                        heapq.heappop(self_pending)
276
                        heapq.heappop(basis_pending)
277
                        continue
278
                    # Now we need to expand this node before we can continue
3735.2.31 by Robert Collins
CHKMap.iter_changes
279
                    if read_self:
280
                        prefix, node, path = heapq.heappop(self_pending)
281
                        if check_excluded(path):
282
                            continue
283
                        process_node(prefix, node, path, self, self_pending)
284
                    if read_basis:
285
                        prefix, node, path = heapq.heappop(basis_pending)
286
                        if check_excluded(path):
287
                            continue
288
                        process_node(prefix, node, path, basis, basis_pending)
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
289
        # print loop_counter
3735.2.30 by Robert Collins
Start iter_changes between CHKMap instances.
290
3735.2.9 by Robert Collins
Get a working chk_map using inventory implementation bootstrapped.
291
    def iteritems(self, key_filter=None):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
292
        """Iterate over the entire CHKMap's contents."""
293
        self._ensure_root()
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
294
        return self._root_node.iteritems(self._store, key_filter=key_filter)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
295
3735.2.12 by Robert Collins
Implement commit-via-deltas for split inventory repositories.
296
    def key(self):
297
        """Return the key for this map."""
298
        if isinstance(self._root_node, tuple):
299
            return self._root_node
300
        else:
301
            return self._root_node._key
302
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
303
    def __len__(self):
304
        self._ensure_root()
305
        return len(self._root_node)
306
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
307
    def map(self, key, value):
308
        """Map a key tuple to value."""
309
        # Need a root object.
310
        self._ensure_root()
311
        prefix, node_details = self._root_node.map(self._store, key, value)
312
        if len(node_details) == 1:
313
            self._root_node = node_details[0][1]
314
        else:
315
            self._root_node = InternalNode()
316
            self._root_node.set_maximum_size(node_details[0][1].maximum_size)
317
            self._root_node._key_width = node_details[0][1]._key_width
318
            for split, node in node_details:
319
                self._root_node.add_node(split, node)
320
3735.2.31 by Robert Collins
CHKMap.iter_changes
321
    def _node_key(self, node):
322
        """Get the key for a node whether its a tuple o r node."""
323
        if type(node) == tuple:
324
            return node
325
        else:
326
            return node._key
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
327
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
328
    def unmap(self, key):
329
        """remove key from the map."""
330
        self._ensure_root()
331
        self._root_node.unmap(self._store, key)
332
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
333
    def _save(self):
334
        """Save the map completely.
335
336
        :return: The key of the root node.
337
        """
338
        if type(self._root_node) == tuple:
339
            # Already saved.
340
            return self._root_node
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
341
        keys = list(self._root_node.serialise(self._store))
342
        return keys[-1]
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
343
344
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
345
class Node(object):
346
    """Base class defining the protocol for CHK Map nodes."""
347
348
    def __init__(self, key_width=1):
349
        """Create a node.
350
351
        :param key_width: The width of keys for this node.
352
        """
353
        self._key = None
354
        # Current number of elements
355
        self._len = 0
356
        self._maximum_size = 0
357
        self._key_width = 1
358
        # current size in bytes
359
        self._size = 0
360
        # The pointers/values this node has - meaning defined by child classes.
361
        self._items = {}
362
3735.2.38 by Robert Collins
Sufficient fixes to allow bzr-search to index a dev3 format repository.
363
    def key(self):
364
        return self._key
365
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
366
    def __len__(self):
367
        return self._len
368
369
    @property
370
    def maximum_size(self):
371
        """What is the upper limit for adding references to a node."""
372
        return self._maximum_size
373
374
    def set_maximum_size(self, new_size):
375
        """Set the size threshold for nodes.
376
377
        :param new_size: The size at which no data is added to a node. 0 for
378
            unlimited.
379
        """
380
        self._maximum_size = new_size
381
382
383
class LeafNode(Node):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
384
    """A node containing actual key:value pairs.
385
    
386
    :ivar _items: A dict of key->value items. The key is in tuple form.
387
    """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
388
389
    def __init__(self):
390
        Node.__init__(self)
391
        # The size of a leaf node with default values and no children.
392
        self._size = 12
393
394
    def _current_size(self):
395
        """Answer the current serialised size of this node."""
396
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
397
            len(str(self._maximum_size)))
398
399
    @classmethod
400
    def deserialise(klass, bytes, key):
401
        """Deserialise bytes, with key key, into a LeafNode.
402
403
        :param bytes: The bytes of the node.
404
        :param key: The key that the serialised node has.
405
        """
406
        result = LeafNode()
407
        lines = bytes.splitlines()
408
        items = {}
409
        if lines[0] != 'chkleaf:':
410
            raise ValueError("not a serialised leaf node: %r" % bytes)
411
        maximum_size = int(lines[1])
412
        width = int(lines[2])
413
        length = int(lines[3])
414
        for line in lines[4:]:
3735.2.25 by Robert Collins
CHKInventory core tests passing.
415
            elements = line.split('\x00', width)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
416
            items[tuple(elements[:-1])] = elements[-1]
417
        if len(items) != length:
418
            raise AssertionError("item count mismatch")
419
        result._items = items
420
        result._len = length
421
        result._maximum_size = maximum_size
422
        result._key = key
423
        result._key_width = width
424
        result._size = len(bytes)
425
        return result
426
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
427
    def iteritems(self, store, key_filter=None):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
428
        if key_filter is not None:
429
            for item in self._items.iteritems():
430
                if item[0] in key_filter:
431
                    yield item
432
        else:
433
            for item in self._items.iteritems():
434
                yield item
435
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
436
    def map(self, store, key, value):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
437
        """Map key to value."""
438
        if key in self._items:
439
            self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
440
            self._len -= 1
441
        self._items[key] = value
442
        self._size += 2 + len('\x00'.join(key)) + len(value)
443
        self._len += 1
444
        self._key = None
445
        if (self._maximum_size and self._current_size() > self._maximum_size and
446
            self._len > 1):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
447
            common_prefix = self.unique_serialised_prefix()
448
            split_at = len(common_prefix) + 1
449
            result = {}
450
            for key, value in self._items.iteritems():
451
                serialised_key = self._serialised_key(key)
452
                prefix = serialised_key[:split_at]
453
                if prefix not in result:
454
                    node = LeafNode()
455
                    node.set_maximum_size(self._maximum_size)
456
                    node._key_width = self._key_width
457
                    result[prefix] = node
458
                else:
459
                    node = result[prefix]
460
                node.map(store, key, value)
461
            return common_prefix, result.items()
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
462
        else:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
463
            return self.unique_serialised_prefix(), [("", self)]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
464
465
    def serialise(self, store):
466
        """Serialise the tree to store.
467
468
        :param store: A VersionedFiles honouring the CHK extensions.
469
        :return: An iterable of the keys inserted by this operation.
470
        """
471
        lines = ["chkleaf:\n"]
472
        lines.append("%d\n" % self._maximum_size)
473
        lines.append("%d\n" % self._key_width)
474
        lines.append("%d\n" % self._len)
475
        for key, value in sorted(self._items.items()):
476
            lines.append("%s\x00%s\n" % ('\x00'.join(key), value))
477
        sha1, _, _ = store.add_lines((None,), (), lines)
478
        self._key = ("sha1:" + sha1,)
479
        return [self._key]
480
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
481
    def _serialised_key(self, key):
482
        """Return the serialised key for key in this node."""
483
        return '\x00'.join(key)
484
3735.2.26 by Robert Collins
CHKInventory migrated to new CHKMap code.
485
    def refs(self):
486
        """Return the references to other CHK's held by this node."""
487
        return []
488
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
489
    def unique_serialised_prefix(self):
490
        """Return the unique key prefix for this node.
491
492
        :return: A bytestring of the longest serialised key prefix that is
493
            unique within this node.
494
        """
495
        # may want to cache this eventually :- but wait for enough
496
        # functionality to profile.
497
        keys = list(self._items.keys())
498
        if not keys:
499
            return ""
500
        current_prefix = self._serialised_key(keys.pop(-1))
501
        while current_prefix and keys:
502
            next_key = self._serialised_key(keys.pop(-1))
503
            for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
504
                if left != right:
505
                    pos -= 1
506
                    break
507
            current_prefix = current_prefix[:pos + 1]
508
        return current_prefix
509
510
    def unmap(self, store, key):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
511
        """Unmap key from the node."""
512
        self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
513
        self._len -= 1
514
        del self._items[key]
515
        self._key = None
516
        return self
517
518
519
class InternalNode(Node):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
520
    """A node that contains references to other nodes.
521
    
522
    An InternalNode is responsible for mapping serialised key prefixes to child
523
    nodes. It is greedy - it will defer splitting itself as long as possible.
524
    """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
525
526
    def __init__(self):
527
        Node.__init__(self)
528
        # The size of an internalnode with default values and no children.
529
        # self._size = 12
530
        # How many octets key prefixes within this node are.
531
        self._node_width = 0
532
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
533
    def add_node(self, prefix, node):
534
        """Add a child node with prefix prefix, and node node.
535
536
        :param prefix: The serialised key prefix for node.
537
        :param node: The node being added.
538
        """
539
        self._len += len(node)
540
        if not len(self._items):
541
            self._node_width = len(prefix)
542
        self._items[prefix] = node
543
        self._key = None
544
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
545
    def _current_size(self):
546
        """Answer the current serialised size of this node."""
547
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
548
            len(str(self._maximum_size)))
549
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
550
    @classmethod
551
    def deserialise(klass, bytes, key):
3735.2.25 by Robert Collins
CHKInventory core tests passing.
552
        """Deserialise bytes to an InternalNode, with key key.
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
553
554
        :param bytes: The bytes of the node.
555
        :param key: The key that the serialised node has.
556
        :return: An InternalNode instance.
557
        """
558
        result = InternalNode()
559
        lines = bytes.splitlines()
560
        items = {}
561
        if lines[0] != 'chknode:':
562
            raise ValueError("not a serialised internal node: %r" % bytes)
563
        maximum_size = int(lines[1])
564
        width = int(lines[2])
565
        length = int(lines[3])
566
        for line in lines[4:]:
567
            prefix, flat_key = line.rsplit('\x00', 1)
568
            items[prefix] = (flat_key,)
569
        result._items = items
570
        result._len = length
571
        result._maximum_size = maximum_size
572
        result._key = key
573
        result._key_width = width
574
        result._size = len(bytes)
575
        result._node_width = len(prefix)
576
        return result
577
578
    def iteritems(self, store, key_filter=None):
579
        for node in self._iter_nodes(store, key_filter=key_filter):
580
            for item in node.iteritems(store, key_filter=key_filter):
581
                yield item
582
583
    def _iter_nodes(self, store, key_filter=None):
584
        """Iterate over node objects which match key_filter.
585
586
        :param store: A store to use for accessing content.
587
        :param key_filter: A key filter to filter nodes. Only nodes that might
588
            contain a key in key_filter will be returned.
589
        :return: An iterable of nodes.
590
        """
591
        nodes = []
3735.2.31 by Robert Collins
CHKMap.iter_changes
592
        keys = {}
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
593
        if key_filter is None:
3735.2.31 by Robert Collins
CHKMap.iter_changes
594
            for prefix, node in self._items.iteritems():
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
595
                if type(node) == tuple:
3735.2.31 by Robert Collins
CHKMap.iter_changes
596
                    keys[node] = prefix
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
597
                else:
598
                    nodes.append(node)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
599
        else:
600
            serialised_filter = set([self._serialised_key(key) for key in
601
                key_filter])
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
602
            for prefix, node in self._items.iteritems():
603
                if prefix in serialised_filter:
604
                    if type(node) == tuple:
3735.2.31 by Robert Collins
CHKMap.iter_changes
605
                        keys[node] = prefix
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
606
                    else:
607
                        nodes.append(node)
608
        if keys:
609
            # demand load some pages.
610
            stream = store.get_record_stream(keys, 'unordered', True)
611
            for record in stream:
612
                node = _deserialise(record.get_bytes_as('fulltext'), record.key)
613
                nodes.append(node)
3735.2.31 by Robert Collins
CHKMap.iter_changes
614
                self._items[keys[record.key]] = node
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
615
        return nodes
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
616
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
617
    def map(self, store, key, value):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
618
        """Map key to value."""
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
619
        if not len(self._items):
620
            raise AssertionError("cant map in an empty InternalNode.")
621
        children = self._iter_nodes(store, key_filter=[key])
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
622
        serialised_key = self._serialised_key(key)
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
623
        if children:
624
            child = children[0]
625
        else:
626
            # new child needed:
627
            child = self._new_child(serialised_key, LeafNode)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
628
        old_len = len(child)
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
629
        prefix, node_details = child.map(store, key, value)
630
        if len(node_details) == 1:
631
            # child may have shrunk, or might be the same.
632
            self._len = self._len - old_len + len(child)
633
            self._items[serialised_key] = child
3735.2.29 by Robert Collins
Untested code is broken code.
634
            self._key = None
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
635
            return self.unique_serialised_prefix(), [("", self)]
636
        # child has overflown - create a new intermediate node.
637
        # XXX: This is where we might want to try and expand our depth
638
        # to refer to more bytes of every child (which would give us
639
        # multiple pointers to child nodes, but less intermediate nodes)
640
        child = self._new_child(serialised_key, InternalNode)
641
        for split, node in node_details:
642
            child.add_node(split, node)
643
        self._len = self._len - old_len + len(child)
3735.2.29 by Robert Collins
Untested code is broken code.
644
        self._key = None
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
645
        return self.unique_serialised_prefix(), [("", self)]
646
647
    def _new_child(self, serialised_key, klass):
648
        """Create a new child node of type klass."""
649
        child = klass()
650
        child.set_maximum_size(self._maximum_size)
651
        child._key_width = self._key_width
652
        self._items[serialised_key] = child
653
        return child
654
655
    def serialise(self, store):
656
        """Serialise the node to store.
657
658
        :param store: A VersionedFiles honouring the CHK extensions.
659
        :return: An iterable of the keys inserted by this operation.
660
        """
661
        for node in self._items.itervalues():
662
            if type(node) == tuple:
663
                # Never deserialised.
664
                continue
665
            if node._key is not None:
666
                # Never altered
667
                continue
668
            for key in node.serialise(store):
669
                yield key
670
        lines = ["chknode:\n"]
671
        lines.append("%d\n" % self._maximum_size)
672
        lines.append("%d\n" % self._key_width)
673
        lines.append("%d\n" % self._len)
674
        for prefix, node in sorted(self._items.items()):
675
            if type(node) == tuple:
676
                key = node[0]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
677
            else:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
678
                key = node._key[0]
679
            lines.append("%s\x00%s\n" % (prefix, key))
680
        sha1, _, _ = store.add_lines((None,), (), lines)
681
        self._key = ("sha1:" + sha1,)
682
        yield self._key
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
683
684
    def _serialised_key(self, key):
685
        """Return the serialised key for key in this node."""
686
        return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width]
687
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
688
    def _split(self, offset):
689
        """Split this node into smaller nodes starting at offset.
690
691
        :param offset: The offset to start the new child nodes at.
692
        :return: An iterable of (prefix, node) tuples. prefix is a byte
693
            prefix for reaching node.
694
        """
695
        if offset >= self._node_width:
696
            for node in self._items.values():
697
                for result in node._split(offset):
698
                    yield result
699
            return
700
        for key, node in self._items.items():
701
            pass
702
3735.2.26 by Robert Collins
CHKInventory migrated to new CHKMap code.
703
    def refs(self):
704
        """Return the references to other CHK's held by this node."""
705
        if self._key is None:
706
            raise AssertionError("unserialised nodes have no refs.")
707
        refs = []
708
        for value in self._items.itervalues():
709
            if type(value) == tuple:
710
                refs.append(value)
711
            else:
712
                refs.append(value.key())
713
        return refs
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
714
715
    def unique_serialised_prefix(self):
716
        """Return the unique key prefix for this node.
717
718
        :return: A bytestring of the longest serialised key prefix that is
719
            unique within this node.
720
        """
721
        # may want to cache this eventually :- but wait for enough
722
        # functionality to profile.
723
        keys = list(self._items.keys())
724
        if not keys:
725
            return ""
726
        current_prefix = keys.pop(-1)
727
        while current_prefix and keys:
728
            next_key = keys.pop(-1)
729
            for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
730
                if left != right:
731
                    pos -= 1
732
                    break
733
            current_prefix = current_prefix[:pos + 1]
734
        return current_prefix
735
736
    def unmap(self, store, key):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
737
        """Remove key from this node and it's children."""
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
738
        if not len(self._items):
739
            raise AssertionError("cant unmap in an empty InternalNode.")
740
        serialised_key = self._serialised_key(key)
741
        children = self._iter_nodes(store, key_filter=[key])
742
        serialised_key = self._serialised_key(key)
743
        if children:
744
            child = children[0]
745
        else:
746
            raise KeyError(key)
747
        self._len -= 1
748
        unmapped = child.unmap(store, key)
749
        if len(unmapped) == 0:
750
            # All child nodes are gone, remove the child:
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
751
            del self._items[serialised_key]
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
752
        else:
753
            # Stash the returned node
754
            self._items[serialised_key] = unmapped
3735.2.23 by Robert Collins
Test unmapping with one child left but multiple keys.
755
        if len(self._items) == 1:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
756
            # this node is no longer needed:
757
            return self._items.values()[0]
3735.2.29 by Robert Collins
Untested code is broken code.
758
        self._key = None
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
759
        return self
760
761
3735.2.16 by Robert Collins
Untested extensions to support repodetails
762
def _deserialise(bytes, key):
763
    """Helper for repositorydetails - convert bytes to a node."""
3735.2.24 by Robert Collins
test_chk_map tests all passing.
764
    if bytes.startswith("chkleaf:\n"):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
765
        return LeafNode.deserialise(bytes, key)
766
    elif bytes.startswith("chknode:\n"):
767
        return InternalNode.deserialise(bytes, key)
3735.2.16 by Robert Collins
Untested extensions to support repodetails
768
    else:
769
        raise AssertionError("Unknown node type.")