/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
17
"""Persistent maps from tuple_of_strings->string using CHK stores.
18
19
Overview and current status:
20
21
The CHKMap class implements a dict from tuple_of_strings->string by using a trie
22
with internal nodes of 8-bit fan out; The key tuples are mapped to strings by
23
joining them by \x00, and \x00 padding shorter keys out to the length of the
24
longest key. Leaf nodes are packed as densely as possible, and internal nodes
25
are all and additional 8-bits wide leading to a sparse upper tree. 
26
27
Updates to a CHKMap are done preferentially via the apply_delta method, to
28
allow optimisation of the update operation; but individual map/unmap calls are
29
possible and supported. All changes via map/unmap are buffered in memory until
30
the _save method is called to force serialisation of the tree. apply_delta
31
performs a _save implicitly.
32
33
TODO:
34
-----
35
36
Densely packed upper nodes.
37
38
"""
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
39
3735.2.31 by Robert Collins
CHKMap.iter_changes
40
import heapq
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
41
import osutils
42
43
44
class CHKMap(object):
45
    """A persistent map from string to string backed by a CHK store."""
46
47
    def __init__(self, store, root_key):
48
        """Create a CHKMap object.
49
50
        :param store: The store the CHKMap is stored in.
51
        :param root_key: The root key of the map. None to create an empty
52
            CHKMap.
53
        """
54
        self._store = store
55
        if root_key is None:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
56
            self._root_node = LeafNode()
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
57
        else:
3735.2.41 by Robert Collins
Make the parent_id_basename index be updated during CHKInventory.apply_delta.
58
            self._root_node = self._node_key(root_key)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
59
60
    def apply_delta(self, delta):
61
        """Apply a delta to the map.
62
63
        :param delta: An iterable of old_key, new_key, new_value tuples.
64
            If new_key is not None, then new_key->new_value is inserted
65
            into the map; if old_key is not None, then the old mapping
66
            of old_key is removed.
67
        """
68
        for old, new, value in delta:
69
            if old is not None and old != new:
70
                # unmap
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
71
                self.unmap(old)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
72
        for old, new, value in delta:
73
            if new is not None:
74
                # map
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
75
                self.map(new, value)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
76
        return self._save()
77
78
    def _ensure_root(self):
79
        """Ensure that the root node is an object not a key."""
80
        if type(self._root_node) == tuple:
81
            # Demand-load the root
3735.2.31 by Robert Collins
CHKMap.iter_changes
82
            self._root_node = self._get_node(self._root_node)
83
84
    def _get_node(self, node):
85
        """Get a node.
86
87
        Node that this does not update the _items dict in objects containing a
88
        reference to this node. As such it does not prevent subsequent IO being
89
        performed.
90
        
91
        :param node: A tuple key or node object.
92
        :return: A node object.
93
        """
94
        if type(node) == tuple:
95
            bytes = self._read_bytes(node)
96
            return _deserialise(bytes, node)
97
        else:
98
            return node
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
99
100
    def _read_bytes(self, key):
101
        stream = self._store.get_record_stream([key], 'unordered', True)
102
        return stream.next().get_bytes_as('fulltext')
103
3735.9.2 by John Arbash Meinel
Add a _dump_tree helper that assists in debugging what is going on.
104
    def _dump_tree(self):
105
        """Return the tree in a string representation."""
106
        self._ensure_root()
107
        res = self._dump_tree_node(self._root_node, prefix='', indent='')
108
        return ''.join(res)
109
110
    def _dump_tree_node(self, node, prefix, indent):
111
        """For this node and all children, generate a string representation."""
112
        result = []
113
        result.append('%s%r %s %s\n' % (indent, prefix, node.__class__.__name__,
114
                                        node.key()[0]))
115
        if isinstance(node, InternalNode):
116
            # Trigger all child nodes to get loaded
117
            list(node._iter_nodes(self._store))
118
            for prefix, sub in node._items.iteritems():
119
                result.extend(self._dump_tree_node(sub, prefix, indent + '  '))
120
        else:
121
            for key, value in node._items.iteritems():
122
                result.append("%s  %r %r\n" % (indent, key, value))
123
        return result
124
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
125
    @classmethod
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
126
    def from_dict(klass, store, initial_value, maximum_size=0):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
127
        """Create a CHKMap in store with initial_value as the content.
128
        
129
        :param store: The store to record initial_value in, a VersionedFiles
130
            object with 1-tuple keys supporting CHK key generation.
131
        :param initial_value: A dict to store in store. Its keys and values
132
            must be bytestrings.
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
133
        :param maximum_size: The maximum_size rule to apply to nodes. This
134
            determines the size at which no new data is added to a single node.
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
135
        :return: The root chk of te resulting CHKMap.
136
        """
137
        result = CHKMap(store, None)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
138
        result._root_node.set_maximum_size(maximum_size)
139
        delta = []
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
140
        for key, value in initial_value.items():
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
141
            delta.append((None, key, value))
142
        result.apply_delta(delta)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
143
        return result._save()
144
3735.2.30 by Robert Collins
Start iter_changes between CHKMap instances.
145
    def iter_changes(self, basis):
146
        """Iterate over the changes between basis and self.
147
148
        :return: An iterator of tuples: (key, old_value, new_value). Old_value
149
            is None for keys only in self; new_value is None for keys only in
150
            basis.
151
        """
3735.2.31 by Robert Collins
CHKMap.iter_changes
152
        # Overview:
153
        # Read both trees in lexographic, highest-first order.
154
        # Any identical nodes we skip
155
        # Any unique prefixes we output immediately.
156
        # values in a leaf node are treated as single-value nodes in the tree
157
        # which allows them to be not-special-cased. We know to output them
158
        # because their value is a string, not a key(tuple) or node.
159
        #
160
        # corner cases to beware of when considering this function:
161
        # *) common references are at different heights.
162
        #    consider two trees:
163
        #    {'a': LeafNode={'aaa':'foo', 'aab':'bar'}, 'b': LeafNode={'b'}}
164
        #    {'a': InternalNode={'aa':LeafNode={'aaa':'foo', 'aab':'bar'}, 'ab':LeafNode={'ab':'bar'}}
165
        #     'b': LeafNode={'b'}}
166
        #    the node with aaa/aab will only be encountered in the second tree
167
        #    after reading the 'a' subtree, but it is encountered in the first
168
        #    tree immediately. Variations on this may have read internal nodes like this.
169
        #    we want to cut the entire pending subtree when we realise we have a common node.
170
        #    For this we use a list of keys - the path to a node - and check the entire path is 
171
        #    clean as we process each item.
172
        if self._node_key(self._root_node) == self._node_key(basis._root_node):
173
            return
174
        self._ensure_root()
175
        basis._ensure_root()
176
        excluded_keys = set()
177
        self_node = self._root_node
178
        basis_node = basis._root_node
179
        # A heap, each element is prefix, node(tuple/NodeObject/string),
180
        # key_path (a list of tuples, tail-sharing down the tree.)
181
        self_pending = []
182
        basis_pending = []
183
        def process_node(prefix, node, path, a_map, pending):
184
            # take a node and expand it
185
            node = a_map._get_node(node)
186
            if type(node) == LeafNode:
187
                path = (node._key, path)
188
                for key, value in node._items.items():
189
                    heapq.heappush(pending, ('\x00'.join(key), value, path))
190
            else:
191
                # type(node) == InternalNode
192
                path = (node._key, path)
193
                for prefix, child in node._items.items():
194
                    heapq.heappush(pending, (prefix, child, path))
195
        process_node(None, self_node, None, self, self_pending)
196
        process_node(None, basis_node, None, basis, basis_pending)
197
        self_seen = set()
198
        basis_seen = set()
199
        excluded_keys = set()
200
        def check_excluded(key_path):
201
            # Note that this is N^2, it depends on us trimming trees
202
            # aggressively to not become slow.
203
            # A better implementation would probably have a reverse map
204
            # back to the children of a node, and jump straight to it when 
205
            # a common node is detected, the proceed to remove the already
206
            # pending children. bzrlib.graph has a searcher module with a
207
            # similar problem.
208
            while key_path is not None:
209
                key, key_path = key_path
210
                if key in excluded_keys:
211
                    return True
212
            return False
213
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
214
        loop_counter = 0
3735.2.31 by Robert Collins
CHKMap.iter_changes
215
        while self_pending or basis_pending:
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
216
            loop_counter += 1
3735.2.31 by Robert Collins
CHKMap.iter_changes
217
            if not self_pending:
218
                # self is exhausted: output remainder of basis
219
                for prefix, node, path in basis_pending:
220
                    if check_excluded(path):
221
                        continue
222
                    node = basis._get_node(node)
223
                    if type(node) == str:
224
                        # a value
225
                        yield (tuple(prefix.split('\x00')), node, None)
226
                    else:
227
                        # subtree - fastpath the entire thing.
228
                        for key, value in node.iteritems(basis._store):
229
                            yield (key, value, None)
230
                return
231
            elif not basis_pending:
232
                # basis is exhausted: output remainder of self.
233
                for prefix, node, path in self_pending:
234
                    if check_excluded(path):
235
                        continue
236
                    node = self._get_node(node)
237
                    if type(node) == str:
238
                        # a value
239
                        yield (tuple(prefix.split('\x00')), None, node)
240
                    else:
241
                        # subtree - fastpath the entire thing.
242
                        for key, value in node.iteritems(self._store):
243
                            yield (key, None, value)
244
                return
245
            else:
246
                # XXX: future optimisation - yield the smaller items
247
                # immediately rather than pushing everything on/off the
248
                # heaps. Applies to both internal nodes and leafnodes.
249
                if self_pending[0][0] < basis_pending[0][0]:
250
                    # expand self
251
                    prefix, node, path = heapq.heappop(self_pending)
252
                    if check_excluded(path):
253
                        continue
254
                    if type(node) == str:
255
                        # a value
256
                        yield (tuple(prefix.split('\x00')), None, node)
257
                    else:
258
                        process_node(prefix, node, path, self, self_pending)
259
                        continue
260
                elif self_pending[0][0] > basis_pending[0][0]:
261
                    # expand basis
262
                    prefix, node, path = heapq.heappop(basis_pending)
263
                    if check_excluded(path):
264
                        continue
265
                    if type(node) == str:
266
                        # a value
267
                        yield (tuple(prefix.split('\x00')), node, None)
268
                    else:
269
                        process_node(prefix, node, path, basis, basis_pending)
270
                        continue
271
                else:
272
                    # common prefix: possibly expand both
273
                    if type(self_pending[0][1]) != str:
274
                        # process next self
275
                        read_self = True
276
                    else:
277
                        read_self = False
278
                    if type(basis_pending[0][1]) != str:
279
                        # process next basis
280
                        read_basis = True
281
                    else:
282
                        read_basis = False
283
                    if not read_self and not read_basis:
284
                        # compare a common value
285
                        self_details = heapq.heappop(self_pending)
286
                        basis_details = heapq.heappop(basis_pending)
287
                        if self_details[1] != basis_details[1]:
288
                            yield (tuple(self_details[0].split('\x00')),
289
                                basis_details[1], self_details[1])
290
                        continue
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
291
                    # At least one side wasn't a string.
292
                    if (self._node_key(self_pending[0][1]) ==
293
                        self._node_key(basis_pending[0][1])):
294
                        # Identical pointers, skip (and don't bother adding to
295
                        # excluded, it won't turn up again.
296
                        heapq.heappop(self_pending)
297
                        heapq.heappop(basis_pending)
298
                        continue
299
                    # Now we need to expand this node before we can continue
3735.2.31 by Robert Collins
CHKMap.iter_changes
300
                    if read_self:
301
                        prefix, node, path = heapq.heappop(self_pending)
302
                        if check_excluded(path):
303
                            continue
304
                        process_node(prefix, node, path, self, self_pending)
305
                    if read_basis:
306
                        prefix, node, path = heapq.heappop(basis_pending)
307
                        if check_excluded(path):
308
                            continue
309
                        process_node(prefix, node, path, basis, basis_pending)
3735.2.32 by Robert Collins
Activate test for common node skipping. - 50 times performance improvement.
310
        # print loop_counter
3735.2.30 by Robert Collins
Start iter_changes between CHKMap instances.
311
3735.2.9 by Robert Collins
Get a working chk_map using inventory implementation bootstrapped.
312
    def iteritems(self, key_filter=None):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
313
        """Iterate over the entire CHKMap's contents."""
314
        self._ensure_root()
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
315
        return self._root_node.iteritems(self._store, key_filter=key_filter)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
316
3735.2.12 by Robert Collins
Implement commit-via-deltas for split inventory repositories.
317
    def key(self):
318
        """Return the key for this map."""
319
        if isinstance(self._root_node, tuple):
320
            return self._root_node
321
        else:
322
            return self._root_node._key
323
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
324
    def __len__(self):
325
        self._ensure_root()
326
        return len(self._root_node)
327
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
328
    def map(self, key, value):
329
        """Map a key tuple to value."""
330
        # Need a root object.
331
        self._ensure_root()
332
        prefix, node_details = self._root_node.map(self._store, key, value)
333
        if len(node_details) == 1:
334
            self._root_node = node_details[0][1]
335
        else:
336
            self._root_node = InternalNode()
337
            self._root_node.set_maximum_size(node_details[0][1].maximum_size)
338
            self._root_node._key_width = node_details[0][1]._key_width
339
            for split, node in node_details:
340
                self._root_node.add_node(split, node)
341
3735.2.31 by Robert Collins
CHKMap.iter_changes
342
    def _node_key(self, node):
343
        """Get the key for a node whether its a tuple o r node."""
344
        if type(node) == tuple:
345
            return node
346
        else:
347
            return node._key
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
348
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
349
    def unmap(self, key):
350
        """remove key from the map."""
351
        self._ensure_root()
352
        self._root_node.unmap(self._store, key)
353
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
354
    def _save(self):
355
        """Save the map completely.
356
357
        :return: The key of the root node.
358
        """
359
        if type(self._root_node) == tuple:
360
            # Already saved.
361
            return self._root_node
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
362
        keys = list(self._root_node.serialise(self._store))
363
        return keys[-1]
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
364
365
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
366
class Node(object):
367
    """Base class defining the protocol for CHK Map nodes."""
368
369
    def __init__(self, key_width=1):
370
        """Create a node.
371
372
        :param key_width: The width of keys for this node.
373
        """
374
        self._key = None
375
        # Current number of elements
376
        self._len = 0
377
        self._maximum_size = 0
378
        self._key_width = 1
379
        # current size in bytes
380
        self._size = 0
381
        # The pointers/values this node has - meaning defined by child classes.
382
        self._items = {}
383
3735.2.38 by Robert Collins
Sufficient fixes to allow bzr-search to index a dev3 format repository.
384
    def key(self):
385
        return self._key
386
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
387
    def __len__(self):
388
        return self._len
389
390
    @property
391
    def maximum_size(self):
392
        """What is the upper limit for adding references to a node."""
393
        return self._maximum_size
394
395
    def set_maximum_size(self, new_size):
396
        """Set the size threshold for nodes.
397
398
        :param new_size: The size at which no data is added to a node. 0 for
399
            unlimited.
400
        """
401
        self._maximum_size = new_size
402
403
404
class LeafNode(Node):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
405
    """A node containing actual key:value pairs.
406
    
407
    :ivar _items: A dict of key->value items. The key is in tuple form.
408
    """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
409
410
    def __init__(self):
411
        Node.__init__(self)
412
        # The size of a leaf node with default values and no children.
413
        self._size = 12
414
415
    def _current_size(self):
416
        """Answer the current serialised size of this node."""
417
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
418
            len(str(self._maximum_size)))
419
420
    @classmethod
421
    def deserialise(klass, bytes, key):
422
        """Deserialise bytes, with key key, into a LeafNode.
423
424
        :param bytes: The bytes of the node.
425
        :param key: The key that the serialised node has.
426
        """
427
        result = LeafNode()
428
        lines = bytes.splitlines()
429
        items = {}
430
        if lines[0] != 'chkleaf:':
431
            raise ValueError("not a serialised leaf node: %r" % bytes)
432
        maximum_size = int(lines[1])
433
        width = int(lines[2])
434
        length = int(lines[3])
435
        for line in lines[4:]:
3735.2.25 by Robert Collins
CHKInventory core tests passing.
436
            elements = line.split('\x00', width)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
437
            items[tuple(elements[:-1])] = elements[-1]
438
        if len(items) != length:
439
            raise AssertionError("item count mismatch")
440
        result._items = items
441
        result._len = length
442
        result._maximum_size = maximum_size
443
        result._key = key
444
        result._key_width = width
445
        result._size = len(bytes)
446
        return result
447
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
448
    def iteritems(self, store, key_filter=None):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
449
        if key_filter is not None:
450
            for item in self._items.iteritems():
451
                if item[0] in key_filter:
452
                    yield item
453
        else:
454
            for item in self._items.iteritems():
455
                yield item
456
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
457
    def map(self, store, key, value):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
458
        """Map key to value."""
459
        if key in self._items:
460
            self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
461
            self._len -= 1
462
        self._items[key] = value
463
        self._size += 2 + len('\x00'.join(key)) + len(value)
464
        self._len += 1
465
        self._key = None
466
        if (self._maximum_size and self._current_size() > self._maximum_size and
467
            self._len > 1):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
468
            common_prefix = self.unique_serialised_prefix()
469
            split_at = len(common_prefix) + 1
470
            result = {}
471
            for key, value in self._items.iteritems():
472
                serialised_key = self._serialised_key(key)
473
                prefix = serialised_key[:split_at]
474
                if prefix not in result:
475
                    node = LeafNode()
476
                    node.set_maximum_size(self._maximum_size)
477
                    node._key_width = self._key_width
478
                    result[prefix] = node
479
                else:
480
                    node = result[prefix]
481
                node.map(store, key, value)
482
            return common_prefix, result.items()
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
483
        else:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
484
            return self.unique_serialised_prefix(), [("", self)]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
485
486
    def serialise(self, store):
487
        """Serialise the tree to store.
488
489
        :param store: A VersionedFiles honouring the CHK extensions.
490
        :return: An iterable of the keys inserted by this operation.
491
        """
492
        lines = ["chkleaf:\n"]
493
        lines.append("%d\n" % self._maximum_size)
494
        lines.append("%d\n" % self._key_width)
495
        lines.append("%d\n" % self._len)
496
        for key, value in sorted(self._items.items()):
497
            lines.append("%s\x00%s\n" % ('\x00'.join(key), value))
498
        sha1, _, _ = store.add_lines((None,), (), lines)
499
        self._key = ("sha1:" + sha1,)
500
        return [self._key]
501
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
502
    def _serialised_key(self, key):
503
        """Return the serialised key for key in this node."""
504
        return '\x00'.join(key)
505
3735.2.26 by Robert Collins
CHKInventory migrated to new CHKMap code.
506
    def refs(self):
507
        """Return the references to other CHK's held by this node."""
508
        return []
509
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
510
    def unique_serialised_prefix(self):
511
        """Return the unique key prefix for this node.
512
513
        :return: A bytestring of the longest serialised key prefix that is
514
            unique within this node.
515
        """
516
        # may want to cache this eventually :- but wait for enough
517
        # functionality to profile.
518
        keys = list(self._items.keys())
519
        if not keys:
520
            return ""
521
        current_prefix = self._serialised_key(keys.pop(-1))
522
        while current_prefix and keys:
523
            next_key = self._serialised_key(keys.pop(-1))
524
            for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
525
                if left != right:
526
                    pos -= 1
527
                    break
528
            current_prefix = current_prefix[:pos + 1]
529
        return current_prefix
530
531
    def unmap(self, store, key):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
532
        """Unmap key from the node."""
533
        self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
534
        self._len -= 1
535
        del self._items[key]
536
        self._key = None
537
        return self
538
539
540
class InternalNode(Node):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
541
    """A node that contains references to other nodes.
542
    
543
    An InternalNode is responsible for mapping serialised key prefixes to child
544
    nodes. It is greedy - it will defer splitting itself as long as possible.
545
    """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
546
547
    def __init__(self):
548
        Node.__init__(self)
549
        # The size of an internalnode with default values and no children.
550
        # self._size = 12
551
        # How many octets key prefixes within this node are.
552
        self._node_width = 0
553
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
554
    def add_node(self, prefix, node):
555
        """Add a child node with prefix prefix, and node node.
556
557
        :param prefix: The serialised key prefix for node.
558
        :param node: The node being added.
559
        """
560
        self._len += len(node)
561
        if not len(self._items):
562
            self._node_width = len(prefix)
563
        self._items[prefix] = node
564
        self._key = None
565
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
566
    def _current_size(self):
567
        """Answer the current serialised size of this node."""
568
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
569
            len(str(self._maximum_size)))
570
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
571
    @classmethod
572
    def deserialise(klass, bytes, key):
3735.2.25 by Robert Collins
CHKInventory core tests passing.
573
        """Deserialise bytes to an InternalNode, with key key.
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
574
575
        :param bytes: The bytes of the node.
576
        :param key: The key that the serialised node has.
577
        :return: An InternalNode instance.
578
        """
579
        result = InternalNode()
580
        lines = bytes.splitlines()
581
        items = {}
582
        if lines[0] != 'chknode:':
583
            raise ValueError("not a serialised internal node: %r" % bytes)
584
        maximum_size = int(lines[1])
585
        width = int(lines[2])
586
        length = int(lines[3])
587
        for line in lines[4:]:
588
            prefix, flat_key = line.rsplit('\x00', 1)
589
            items[prefix] = (flat_key,)
590
        result._items = items
591
        result._len = length
592
        result._maximum_size = maximum_size
593
        result._key = key
594
        result._key_width = width
595
        result._size = len(bytes)
596
        result._node_width = len(prefix)
597
        return result
598
599
    def iteritems(self, store, key_filter=None):
600
        for node in self._iter_nodes(store, key_filter=key_filter):
601
            for item in node.iteritems(store, key_filter=key_filter):
602
                yield item
603
604
    def _iter_nodes(self, store, key_filter=None):
605
        """Iterate over node objects which match key_filter.
606
607
        :param store: A store to use for accessing content.
608
        :param key_filter: A key filter to filter nodes. Only nodes that might
609
            contain a key in key_filter will be returned.
610
        :return: An iterable of nodes.
611
        """
612
        nodes = []
3735.2.31 by Robert Collins
CHKMap.iter_changes
613
        keys = {}
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
614
        if key_filter is None:
3735.2.31 by Robert Collins
CHKMap.iter_changes
615
            for prefix, node in self._items.iteritems():
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
616
                if type(node) == tuple:
3735.2.31 by Robert Collins
CHKMap.iter_changes
617
                    keys[node] = prefix
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
618
                else:
619
                    nodes.append(node)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
620
        else:
621
            serialised_filter = set([self._serialised_key(key) for key in
622
                key_filter])
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
623
            for prefix, node in self._items.iteritems():
624
                if prefix in serialised_filter:
625
                    if type(node) == tuple:
3735.2.31 by Robert Collins
CHKMap.iter_changes
626
                        keys[node] = prefix
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
627
                    else:
628
                        nodes.append(node)
629
        if keys:
630
            # demand load some pages.
631
            stream = store.get_record_stream(keys, 'unordered', True)
632
            for record in stream:
633
                node = _deserialise(record.get_bytes_as('fulltext'), record.key)
634
                nodes.append(node)
3735.2.31 by Robert Collins
CHKMap.iter_changes
635
                self._items[keys[record.key]] = node
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
636
        return nodes
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
637
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
638
    def map(self, store, key, value):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
639
        """Map key to value."""
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
640
        if not len(self._items):
641
            raise AssertionError("cant map in an empty InternalNode.")
642
        children = self._iter_nodes(store, key_filter=[key])
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
643
        serialised_key = self._serialised_key(key)
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
644
        if children:
645
            child = children[0]
646
        else:
647
            # new child needed:
648
            child = self._new_child(serialised_key, LeafNode)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
649
        old_len = len(child)
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
650
        prefix, node_details = child.map(store, key, value)
651
        if len(node_details) == 1:
652
            # child may have shrunk, or might be the same.
653
            self._len = self._len - old_len + len(child)
654
            self._items[serialised_key] = child
3735.2.29 by Robert Collins
Untested code is broken code.
655
            self._key = None
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
656
            return self.unique_serialised_prefix(), [("", self)]
657
        # child has overflown - create a new intermediate node.
658
        # XXX: This is where we might want to try and expand our depth
659
        # to refer to more bytes of every child (which would give us
660
        # multiple pointers to child nodes, but less intermediate nodes)
661
        child = self._new_child(serialised_key, InternalNode)
662
        for split, node in node_details:
663
            child.add_node(split, node)
664
        self._len = self._len - old_len + len(child)
3735.2.29 by Robert Collins
Untested code is broken code.
665
        self._key = None
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
666
        return self.unique_serialised_prefix(), [("", self)]
667
668
    def _new_child(self, serialised_key, klass):
669
        """Create a new child node of type klass."""
670
        child = klass()
671
        child.set_maximum_size(self._maximum_size)
672
        child._key_width = self._key_width
673
        self._items[serialised_key] = child
674
        return child
675
676
    def serialise(self, store):
677
        """Serialise the node to store.
678
679
        :param store: A VersionedFiles honouring the CHK extensions.
680
        :return: An iterable of the keys inserted by this operation.
681
        """
682
        for node in self._items.itervalues():
683
            if type(node) == tuple:
684
                # Never deserialised.
685
                continue
686
            if node._key is not None:
687
                # Never altered
688
                continue
689
            for key in node.serialise(store):
690
                yield key
691
        lines = ["chknode:\n"]
692
        lines.append("%d\n" % self._maximum_size)
693
        lines.append("%d\n" % self._key_width)
694
        lines.append("%d\n" % self._len)
695
        for prefix, node in sorted(self._items.items()):
696
            if type(node) == tuple:
697
                key = node[0]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
698
            else:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
699
                key = node._key[0]
700
            lines.append("%s\x00%s\n" % (prefix, key))
701
        sha1, _, _ = store.add_lines((None,), (), lines)
702
        self._key = ("sha1:" + sha1,)
703
        yield self._key
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
704
705
    def _serialised_key(self, key):
706
        """Return the serialised key for key in this node."""
707
        return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width]
708
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
709
    def _split(self, offset):
710
        """Split this node into smaller nodes starting at offset.
711
712
        :param offset: The offset to start the new child nodes at.
713
        :return: An iterable of (prefix, node) tuples. prefix is a byte
714
            prefix for reaching node.
715
        """
716
        if offset >= self._node_width:
717
            for node in self._items.values():
718
                for result in node._split(offset):
719
                    yield result
720
            return
721
        for key, node in self._items.items():
722
            pass
723
3735.2.26 by Robert Collins
CHKInventory migrated to new CHKMap code.
724
    def refs(self):
725
        """Return the references to other CHK's held by this node."""
726
        if self._key is None:
727
            raise AssertionError("unserialised nodes have no refs.")
728
        refs = []
729
        for value in self._items.itervalues():
730
            if type(value) == tuple:
731
                refs.append(value)
732
            else:
733
                refs.append(value.key())
734
        return refs
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
735
736
    def unique_serialised_prefix(self):
737
        """Return the unique key prefix for this node.
738
739
        :return: A bytestring of the longest serialised key prefix that is
740
            unique within this node.
741
        """
742
        # may want to cache this eventually :- but wait for enough
743
        # functionality to profile.
744
        keys = list(self._items.keys())
745
        if not keys:
746
            return ""
747
        current_prefix = keys.pop(-1)
748
        while current_prefix and keys:
749
            next_key = keys.pop(-1)
750
            for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
751
                if left != right:
752
                    pos -= 1
753
                    break
754
            current_prefix = current_prefix[:pos + 1]
755
        return current_prefix
756
757
    def unmap(self, store, key):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
758
        """Remove key from this node and it's children."""
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
759
        if not len(self._items):
760
            raise AssertionError("cant unmap in an empty InternalNode.")
761
        serialised_key = self._serialised_key(key)
762
        children = self._iter_nodes(store, key_filter=[key])
763
        serialised_key = self._serialised_key(key)
764
        if children:
765
            child = children[0]
766
        else:
767
            raise KeyError(key)
768
        self._len -= 1
769
        unmapped = child.unmap(store, key)
770
        if len(unmapped) == 0:
771
            # All child nodes are gone, remove the child:
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
772
            del self._items[serialised_key]
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
773
        else:
774
            # Stash the returned node
775
            self._items[serialised_key] = unmapped
3735.2.23 by Robert Collins
Test unmapping with one child left but multiple keys.
776
        if len(self._items) == 1:
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
777
            # this node is no longer needed:
778
            return self._items.values()[0]
3735.2.29 by Robert Collins
Untested code is broken code.
779
        self._key = None
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
780
        return self
781
782
3735.2.16 by Robert Collins
Untested extensions to support repodetails
783
def _deserialise(bytes, key):
784
    """Helper for repositorydetails - convert bytes to a node."""
3735.2.24 by Robert Collins
test_chk_map tests all passing.
785
    if bytes.startswith("chkleaf:\n"):
3735.2.21 by Robert Collins
BROKEN: multi level CHKMap tries, unfinished, subsystem in flux.
786
        return LeafNode.deserialise(bytes, key)
787
    elif bytes.startswith("chknode:\n"):
788
        return InternalNode.deserialise(bytes, key)
3735.2.16 by Robert Collins
Untested extensions to support repodetails
789
    else:
790
        raise AssertionError("Unknown node type.")
3735.9.1 by John Arbash Meinel
Start working on an iter_interesting_nodes, which can find nodes to transmit
791
792
793
def iter_interesting_nodes(store, interesting_root_keys,
794
                           uninteresting_root_keys):
795
    """Given root keys, find interesting nodes.
796
797
    Evaluate nodes referenced by interesting_root_keys. Ones that are also
798
    referenced from uninteresting_root_keys are not considered interesting.
799
800
    :param interesting_root_keys: keys which should be part of the
801
        "interesting" nodes (which will be yielded)
802
    :param uninteresting_root_keys: keys which should be filtered out of the
803
        result set.
804
    :return: Yield
805
        (interesting records, interesting chk's, interesting key:values)
806
    """
807
    uninteresting_keys = set(uninteresting_root_keys)
808
    interesting_keys = set(interesting_root_keys)
809
    # What about duplicates with uninteresting_root_keys?
810
    interesting_chks = set(interesting_keys)
811
    # TODO: consider that it may be more memory efficient to use the 20-byte
812
    #       sha1 string, rather than tuples of hexidecimal sha1 strings.
813
    uninteresting_chks = set(uninteresting_keys)
814
    uninteresting_key_values = set()
815
816
    # XXX: First attempt, UGLY, UGLY, UGLY
817
    # First, find the full set of uninteresting bits reachable by the
818
    # uninteresting roots
819
    chks_to_read = uninteresting_keys
820
    while chks_to_read:
821
        next_chks = set()
822
        for record in store.get_record_stream(chks_to_read, 'unordered', True):
823
            # TODO: Handle 'absent'
824
            node = _deserialise(record.get_bytes_as('fulltext'), record.key)
825
            if isinstance(node, InternalNode):
826
                # uninteresting_prefix_chks.update(node._items.iteritems())
827
                chks = node._items.values()
828
                # TODO: We remove the entries that are already in
829
                #       uninteresting_chks ?
830
                next_chks.update(chks)
831
                uninteresting_chks.update(chks)
832
            else:
833
                uninteresting_key_values.update(node._items.iteritems())
834
        chks_to_read = next_chks
835
836
    # Is it possible that we would need to filter out the references we know to
837
    # be uninteresting, eg: interesting_keys.difference(uninteresting_chks)
838
    chks_to_read = interesting_keys
839
    while chks_to_read:
840
        next_chks = set()
841
        records = {}
842
        interesting_items = []
843
        interesting_chks = set()
844
        for record in store.get_record_stream(chks_to_read, 'unordered', True):
845
            records[record.key] = record
846
            # TODO: Handle 'absent'
847
            node = _deserialise(record.get_bytes_as('fulltext'), record.key)
848
            if isinstance(node, InternalNode):
849
                chks = [chk for chk in node._items.itervalues()
850
                             if chk not in uninteresting_chks]
851
                next_chks.update(chks)
852
                # These are now uninteresting everywhere else
853
                uninteresting_chks.update(chks)
854
            else:
855
                interesting_items = [item for item in node._items.iteritems()
856
                                     if item not in uninteresting_key_values]
857
                uninteresting_key_values.update(interesting_items)
858
        yield records, chks_to_read, interesting_items
859
        chks_to_read = next_chks