# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
"""B+Tree indices"""

from __future__ import absolute_import

from .lazy_import import lazy_import
lazy_import(globals(), """
""")
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
_BTSIGNATURE = "B+Tree Graph Index 2\n"
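# The signature is written as the very first bytes of a serialised index;
# readers check it before trusting any of the following header options.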
    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            # padded note:
            self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
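        # Interning the StaticTuple means repeated additions of an equal key
        # share a single object, keeping large in-memory index builds compact
        # and making later equality checks cheap.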
        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        # direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
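            # A slot set to None marks an exhausted iterator; the surrounding
            # loop keeps yielding the smallest remaining key until every slot
            # is None.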
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise errors.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + "\n"
                self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
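        # (A final row longer than one page means a multi-page index, which
        # gets spooled to a real temporary file above; a root-only index is
        # assembled in a cheap in-memory buffer instead.)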
        result.writelines(lines)
        position = sum(map(len, lines))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                        " expected: %d, got: %d"
                        % ((row.nodes - 1) * _PAGE_SIZE,
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
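                    # The resulting structure is a nested-dict trie: a key
                    # like ('a', 'b', 'c') ends up stored at
                    # nodes_by_key['a']['b']['c'].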
        """In memory indices have no known corruption at the moment."""
class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')
    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        key_list = _btree_serializer._parse_leaf_lines(bytes,
            key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
class _InternalNode(object):
        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)
            if total_pages - len(cached_offsets) <= self._recommended_pages:
                # Read whatever is left
                if cached_offsets:
                    expanded = [x for x in range(total_pages)
                                if x not in cached_offsets]
                else:
                    expanded = list(range(total_pages))
                if 'index' in debug.debug_flags:
                    trace.mutter(' reading all unread pages: %s', expanded)
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1: # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        # return [(o, offsets[o]) for o in sorted(offsets)]
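        # (Illustration of the two cost formulas above: for 100 in_keys and
        # 40000 fixed_keys, iter_steps is ~40100 while bisect_steps is
        # ~100 * log2(40000), i.e. roughly 1530.)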
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass
                while cur_in_key < cur_fixed_key:
                    cur_keys.append(cur_in_key)
                    try:
                        cur_in_key = next(in_keys_iter)
                    except StopIteration:
                        raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
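                    # Parents that happen to live on this same leaf page are
                    # resolved immediately instead of queueing another page
                    # read.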
        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def key_count(self):
        """Return an estimate of the number of keys in this index.
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise errors.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                options_line[len(_OPTION_ROW_LENGTHS):].split(',')
                if length]
        except ValueError:
            raise errors.BadIndexOptions(self)
        self._compute_row_offsets()
            self._size = num_bytes - base_offset
            # the whole thing should be parsed out of 'bytes'
            ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                for start in range(base_offset, num_bytes, _PAGE_SIZE)]
            if offset > self._size:
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
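            # Each page is a zlib-compressed _PAGE_SIZE chunk; its leading
            # flag bytes say whether it holds leaf entries or internal
            # pointers to the row below.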
            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
_gcchk_factory = _LeafNode

try:
    from breezy import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from breezy import _btree_serializer_py as _btree_serializer
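# The compiled (pyx) serializer is optional: if its import fails, the
# slower pure-Python module is substituted and the failure is reported
# once via osutils.failed_to_load_extension.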