/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Persistent maps from string->string using CHK stores."""
18
19
import osutils
20
21
22
class CHKMap(object):
23
    """A persistent map from string to string backed by a CHK store."""
24
25
    def __init__(self, store, root_key):
26
        """Create a CHKMap object.
27
28
        :param store: The store the CHKMap is stored in.
29
        :param root_key: The root key of the map. None to create an empty
30
            CHKMap.
31
        """
32
        self._store = store
33
        if root_key is None:
34
            self._root_node = RootNode()
35
        else:
36
            self._root_node = root_key
37
38
    def apply_delta(self, delta):
39
        """Apply a delta to the map.
40
41
        :param delta: An iterable of old_key, new_key, new_value tuples.
42
            If new_key is not None, then new_key->new_value is inserted
43
            into the map; if old_key is not None, then the old mapping
44
            of old_key is removed.
45
        """
46
        for old, new, value in delta:
47
            if old is not None and old != new:
48
                # unmap
49
                self._unmap(old)
50
        for old, new, value in delta:
51
            if new is not None:
52
                # map
53
                self._map(new, value)
54
        return self._save()
55
56
    def _ensure_root(self):
57
        """Ensure that the root node is an object not a key."""
58
        if type(self._root_node) == tuple:
59
            # Demand-load the root
60
            bytes = self._read_bytes(self._root_node)
61
            root_key = self._root_node
62
            self._root_node = RootNode()
63
            self._root_node.deserialise(bytes, root_key)
64
65
    def _read_bytes(self, key):
66
        stream = self._store.get_record_stream([key], 'unordered', True)
67
        return stream.next().get_bytes_as('fulltext')
68
69
    @classmethod
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
70
    def from_dict(klass, store, initial_value, maximum_size=0):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
71
        """Create a CHKMap in store with initial_value as the content.
72
        
73
        :param store: The store to record initial_value in, a VersionedFiles
74
            object with 1-tuple keys supporting CHK key generation.
75
        :param initial_value: A dict to store in store. Its keys and values
76
            must be bytestrings.
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
77
        :param maximum_size: The maximum_size rule to apply to nodes. This
78
            determines the size at which no new data is added to a single node.
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
79
        :return: The root chk of te resulting CHKMap.
80
        """
81
        result = CHKMap(store, None)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
82
        result._root_node.set_maximum_size(maximum_size)
83
        delta = []
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
84
        for key, value in initial_value.items():
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
85
            delta.append((None, key, value))
86
        result.apply_delta(delta)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
87
        return result._save()
88
3735.2.9 by Robert Collins
Get a working chk_map using inventory implementation bootstrapped.
89
    def iteritems(self, key_filter=None):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
90
        """Iterate over the entire CHKMap's contents."""
91
        self._ensure_root()
3735.2.9 by Robert Collins
Get a working chk_map using inventory implementation bootstrapped.
92
        if key_filter is not None:
93
            for name, key, in self._root_node._nodes.iteritems():
94
                if name in key_filter:
95
                    bytes = self._read_bytes(key)
96
                    yield name, ValueNode.deserialise(bytes).value
97
        else:
98
            for name, key, in self._root_node._nodes.iteritems():
99
                bytes = self._read_bytes(key)
100
                yield name, ValueNode.deserialise(bytes).value
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
101
3735.2.12 by Robert Collins
Implement commit-via-deltas for split inventory repositories.
102
    def key(self):
103
        """Return the key for this map."""
104
        if isinstance(self._root_node, tuple):
105
            return self._root_node
106
        else:
107
            return self._root_node._key
108
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
109
    def __len__(self):
110
        self._ensure_root()
111
        return len(self._root_node)
112
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
113
    def _map(self, key, value):
114
        """Map key to value."""
115
        self._ensure_root()
116
        # Store the value
117
        bytes = ValueNode(value).serialise()
3735.2.15 by Robert Collins
More direct implementation of fetch between different serializers.
118
        # Experimental code to probe for keys rather than just adding; its not
119
        # clear if it is an improvement.
120
        #chk = ("sha1:%s" % osutils.sha_string(bytes),)
121
        #if not self._store.get_parent_map([key]):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
122
        sha1, _, _ = self._store.add_lines((None,), (), osutils.split_lines(bytes))
3735.2.15 by Robert Collins
More direct implementation of fetch between different serializers.
123
        chk = ("sha1:" + sha1,)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
124
        # And link into the root
3735.2.15 by Robert Collins
More direct implementation of fetch between different serializers.
125
        self._root_node.add_child(key, chk)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
126
127
    def _unmap(self, key):
128
        """remove key from the map."""
129
        self._ensure_root()
130
        self._root_node.remove_child(key)
131
132
    def _save(self):
133
        """Save the map completely.
134
135
        :return: The key of the root node.
136
        """
137
        if type(self._root_node) == tuple:
138
            # Already saved.
139
            return self._root_node
140
        # TODO: flush root_nodes children?
141
        bytes = self._root_node.serialise()
142
        sha1, _, _ = self._store.add_lines((None,), (),
143
            osutils.split_lines(bytes))
144
        result = ("sha1:" + sha1,)
145
        self._root_node._key = result
146
        return result
147
148
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
149
class Node(object):
150
    """Base class defining the protocol for CHK Map nodes."""
151
152
    def __init__(self, key_width=1):
153
        """Create a node.
154
155
        :param key_width: The width of keys for this node.
156
        """
157
        self._key = None
158
        # Current number of elements
159
        self._len = 0
160
        self._maximum_size = 0
161
        self._key_width = 1
162
        # current size in bytes
163
        self._size = 0
164
        # The pointers/values this node has - meaning defined by child classes.
165
        self._items = {}
166
167
    def __len__(self):
168
        return self._len
169
170
    @property
171
    def maximum_size(self):
172
        """What is the upper limit for adding references to a node."""
173
        return self._maximum_size
174
175
    def set_maximum_size(self, new_size):
176
        """Set the size threshold for nodes.
177
178
        :param new_size: The size at which no data is added to a node. 0 for
179
            unlimited.
180
        """
181
        self._maximum_size = new_size
182
183
184
class LeafNode(Node):
185
    """A node containing actual key:value pairs."""
186
187
    def __init__(self):
188
        Node.__init__(self)
189
        # The size of a leaf node with default values and no children.
190
        self._size = 12
191
192
    def _current_size(self):
193
        """Answer the current serialised size of this node."""
194
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
195
            len(str(self._maximum_size)))
196
197
    @classmethod
198
    def deserialise(klass, bytes, key):
199
        """Deserialise bytes, with key key, into a LeafNode.
200
201
        :param bytes: The bytes of the node.
202
        :param key: The key that the serialised node has.
203
        """
204
        result = LeafNode()
205
        lines = bytes.splitlines()
206
        items = {}
207
        if lines[0] != 'chkleaf:':
208
            raise ValueError("not a serialised leaf node: %r" % bytes)
209
        maximum_size = int(lines[1])
210
        width = int(lines[2])
211
        length = int(lines[3])
212
        for line in lines[4:]:
213
            elements = line.split('\x00')
214
            items[tuple(elements[:-1])] = elements[-1]
215
        if len(items) != length:
216
            raise AssertionError("item count mismatch")
217
        result._items = items
218
        result._len = length
219
        result._maximum_size = maximum_size
220
        result._key = key
221
        result._key_width = width
222
        result._size = len(bytes)
223
        return result
224
225
    def iteritems(self, key_filter=None):
226
        if key_filter is not None:
227
            for item in self._items.iteritems():
228
                if item[0] in key_filter:
229
                    yield item
230
        else:
231
            for item in self._items.iteritems():
232
                yield item
233
234
    def key(self):
235
        return self._key
236
237
    def map(self, key, value):
238
        """Map key to value."""
239
        if key in self._items:
240
            self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
241
            self._len -= 1
242
        self._items[key] = value
243
        self._size += 2 + len('\x00'.join(key)) + len(value)
244
        self._len += 1
245
        self._key = None
246
        if (self._maximum_size and self._current_size() > self._maximum_size and
247
            self._len > 1):
248
            result = InternalNode()
249
            result.set_maximum_size(self._maximum_size)
250
            result._key_width = self._key_width
251
            result._add_node(self)
252
            return result
253
        else:
254
            return self
255
256
    def serialise(self, store):
257
        """Serialise the tree to store.
258
259
        :param store: A VersionedFiles honouring the CHK extensions.
260
        :return: An iterable of the keys inserted by this operation.
261
        """
262
        lines = ["chkleaf:\n"]
263
        lines.append("%d\n" % self._maximum_size)
264
        lines.append("%d\n" % self._key_width)
265
        lines.append("%d\n" % self._len)
266
        for key, value in sorted(self._items.items()):
267
            lines.append("%s\x00%s\n" % ('\x00'.join(key), value))
268
        sha1, _, _ = store.add_lines((None,), (), lines)
269
        self._key = ("sha1:" + sha1,)
270
        return [self._key]
271
272
    def unmap(self, key):
273
        """Unmap key from the node."""
274
        self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
275
        self._len -= 1
276
        del self._items[key]
277
        self._key = None
278
        return self
279
280
281
class InternalNode(Node):
282
    """A node that contains references to other nodes."""
283
284
    def __init__(self):
285
        Node.__init__(self)
286
        # The size of an internalnode with default values and no children.
287
        # self._size = 12
288
        # How many octets key prefixes within this node are.
289
        self._node_width = 0
290
291
    def _add_node(self, node):
292
        """Add a node to the InternalNode.
293
294
        :param node: An existing node to add. The node will be examined to see
295
            if it is over or undersized and rebalanced if needed across this
296
            nodes children.
297
        """
298
        if self._len == 0:
299
            # new tree level, we're being populated by upspill from a overfull
300
            # tree.
301
            # Cheap-to-code-but-slow?
302
            elements = {}
303
            max_width = 0
304
            # suck in all the values
305
            for key, value in node.iteritems():
306
                # We work on the serialised keys
307
                serialised_key = '\x00'.join(key)
308
                elements[serialised_key] = (key, value)
309
                max_width = max(len(serialised_key), max_width)
310
            # Determine the maximum common key width we will internally handle.
311
            # Start with the full key width; if that exceeds our node size
312
            # shrink it until we are within the node limit.
313
            self._node_width = max_width
314
            width = self._node_width
315
            # Populate all the resulting keys:
316
            items = self._items
317
            for serialised_key, key_value in elements.iteritems():
318
                actual_key = self._serialised_key(key_value[0])
319
                child = items.get(actual_key, None)
320
                if not child:
321
                    child = LeafNode()
322
                    child.set_maximum_size(self._maximum_size)
323
                    child._key_width = self._key_width
324
                    items[actual_key] = child
325
                child.map(key_value[0], key_value[1])
326
                self._len += 1
327
        else:
328
            raise NotImplementedError(self._add_node)
329
330
    def _current_size(self):
331
        """Answer the current serialised size of this node."""
332
        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
333
            len(str(self._maximum_size)))
334
335
    def iteritems(self, key_filter=None):
336
        if key_filter is None:
337
            for child in self._items.itervalues():
338
                for item in child.iteritems():
339
                    yield item
340
        else:
341
            serialised_filter = set([self._serialised_key(key) for key in
342
                key_filter])
343
            for key, child in self._items.iteritems():
344
                if key in serialised_filter:
345
                    for item in child.iteritems(key_filter):
346
                        yield item
347
348
    def map(self, key, value):
349
        """Map key to value."""
350
        serialised_key = self._serialised_key(key)
351
        try:
352
            child = self._items[serialised_key]
353
        except KeyError:
354
            child = LeafNode()
355
            child.set_maximum_size(self._maximum_size)
356
            child._key_width = self._key_width
357
            self._items[serialised_key] = child
358
        old_len = len(child)
359
        new_child = child.map(key, value)
360
        # TODO: rebalance/enforce balance
361
        if new_child is not child:
362
            # The child has exceeded its size; if we take more bytes off the
363
            # key prefix for the child, that may fit into our node;
364
            # How many more bytes can we fit?
365
            remaining_size = max(0, self.maximum_size - self._current_size())
366
            size_per_row = (self._node_width + 45 + 2)
367
            # without increasing the key width:
368
            extra_rows = remaining_size / size_per_row
369
            if extra_rows:
370
                # What is the minimum node width increase to split new_child:
371
                offset_bytes = [1]
372
                offset = self._node_width - 1
373
                while len(offset_bytes) == 1 and offset < new_child._node_width:
374
                    offset += 1
375
                    offset_bytes = set(child_key[offset] for child_key in
376
                        new_child._items.keys())
377
                if len(offset_bytes) > 1:
378
                    # We've found the fan out point
379
                    increase = self._node_width - offset
380
                    # calculate how many more pointers we need to carry
381
                    new_keys = len(offset_bytes)
382
                    for subnode in self._items.values():
383
                        new_keys += subnode._key_count(self._node_width, offset)
384
                    if (new_keys * (offset + 45 + 2) +
385
                        self._prelude_size() > self._maximum_size):
386
                        # can't fit it all, accept the new child
387
                        self._items[serialised_key] = new_child
388
                    else:
389
                        # increasing the 
390
                        pass
391
                else:
392
                    # it didn't fan out! wtf!
393
                    raise AssertionError("no fan out")
394
            else:
395
                # leave it split
396
                self._items[serialised_key] = new_child
397
        self._len += 1
398
        return self
399
400
    def _serialised_key(self, key):
401
        """Return the serialised key for key in this node."""
402
        return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width]
403
404
    def _key_count(self, parent_width, cut_width):
405
        """Return the number of keys in/under this node between two widths.
406
407
        :param parent_width: The start offset in keys to consider.
408
        :param cut_width: The width to stop considering at.
409
        """
410
        if cut_width > self._node_width:
411
            raise NotImplementedError(self._key_count)
412
        # Generate a list of unique substrings
413
414
415
    def unmap(self, key):
416
        """Remove key from this node and it's children."""
417
        serialised_key = self._serialised_key(key)
418
        child = self._items[serialised_key]
419
        new_child = child.unmap(key)
420
        # TODO shrink/rebalance
421
        if not len(new_child):
422
            del self._items[serialised_key]
423
            if len(self._items) == 1:
424
                return self._items.values()[0]
425
        elif new_child is not child:
426
            self._items[serialised_key] = new_child
427
        return self
428
429
430
class RootNode(Node):
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
431
    """A root node in a CHKMap."""
432
433
    def __init__(self):
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
434
        Node.__init__(self)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
435
        self._nodes = {}
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
436
        self._size = 12
437
        self.prefix_width = 0
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
438
439
    def add_child(self, name, child):
440
        """Add a child to the node.
441
442
        If the node changes, it's key is reset.
443
444
        :param name: The name of the child. A bytestring.
445
        :param child: The child, a key tuple for the childs value.
446
        """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
447
        if self._maximum_size and self._current_size() >= self._maximum_size:
448
            return False
449
        if name in self._nodes:
450
            self.remove_child(name)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
451
        self._nodes[name] = child
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
452
        self._len += 1
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
453
        self._key = None
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
454
        self._size += len(name) + len(child[0]) + 2
455
        return True
456
457
    def _current_size(self):
458
        """Answer the current serialised size of this node."""
459
        return (self._size + len(str(self._maximum_size)) + len(str(self._len))
460
            + len(str(self.prefix_width)))
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
461
462
    def deserialise(self, bytes, key):
463
        """Set the nodes value to that contained in bytes.
464
        
465
        :param bytes: The bytes of the node.
466
        :param key: The key that the serialised node has.
467
        """
468
        lines = bytes.splitlines()
469
        nodes = {}
470
        if lines[0] != 'chkroot:':
471
            raise ValueError("not a serialised root node: %r" % bytes)
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
472
        maximum_size = int(lines[1])
473
        prefix_width = int(lines[2])
474
        length = int(lines[3])
475
        for line in lines[4:]:
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
476
            name, value = line.split('\x00')
477
            nodes[name] = (value,)
478
        self._nodes = nodes
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
479
        self._len = length
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
480
        self._maximum_size = maximum_size
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
481
        self._key = key
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
482
        self.prefix_width = prefix_width
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
483
3735.2.9 by Robert Collins
Get a working chk_map using inventory implementation bootstrapped.
484
    def refs(self):
485
        """Get the CHK key references this node holds."""
486
        return self._nodes.values()
487
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
488
    def remove_child(self, name):
489
        """Remove name from the node.
490
491
        If the node changes, it's key is reset.
492
493
        :param name: The name to remove from the node.
494
        """
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
495
        node = self._nodes.pop(name)
496
        self._size -= 2 + len(name) + len(node[0])
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
497
        self._len -= 1
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
498
        self._key = None
499
500
    def serialise(self):
501
        """Flatten the node to a bytestring.
502
503
        :return: A bytestring.
504
        """
505
        lines = ["chkroot:\n"]
3735.2.18 by Robert Collins
Partial multi-layer chk dictionary trees.
506
        lines.append("%d\n" % self._maximum_size)
507
        lines.append("%d\n" % self.prefix_width)
3735.2.17 by Robert Collins
Cache node length to avoid full iteration on __len__ calls.
508
        lines.append("%d\n" % self._len)
3735.2.8 by Robert Collins
New chk_map module for use in tree based inventory storage.
509
        for name, child in sorted(self._nodes.items()):
510
            lines.append("%s\x00%s\n" % (name, child[0]))
511
        return "".join(lines)
512
513
514
class ValueNode(object):
515
    """A value in a CHKMap."""
516
517
    def __init__(self, value):
518
        """Create a ValueNode.
519
520
        :param value: The value of this node, must be a bytestring.
521
        """
522
        self.value = value
523
524
    @classmethod
525
    def deserialise(klass, bytes):
526
        """Get a ValueNode from a serialised bytestring.
527
        
528
        :param bytes: The bytes returned by an earlier serialisation.
529
        """
530
        if not bytes.startswith("chkvalue:\n"):
531
            raise ValueError("not a chkvalue %r" % bytes)
532
        return ValueNode(bytes[10:])
533
534
    def serialise(self):
535
        """Flatten the value to a bytestring.
536
537
        :return: A bytestring.
538
        """
539
        return "chkvalue:\n" + self.value
3735.2.16 by Robert Collins
Untested extensions to support repodetails
540
541
    def refs(self):
542
        """ValueNodes have no refs within the dict."""
543
        return []
544
545
546
def _deserialise(bytes, key):
547
    """Helper for repositorydetails - convert bytes to a node."""
548
    if bytes.startswith("chkvalue:\n"):
549
        return ValueNode.deserialise(bytes)
550
    elif bytes.startswith("chkroot:\n"):
551
        result = RootNode()
552
        result.deserialise(bytes, key)
553
        return result
554
    else:
555
        raise AssertionError("Unknown node type.")