# Copyright (C) 2008-2011 Canonical Ltd
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

"""B+Tree indices"""

from __future__ import absolute_import

from ..lazy_import import lazy_import
lazy_import(globals(), """
import bisect
import math
import tempfile
import zlib

from breezy import (
    chunk_writer,
    debug,
    errors,
    fifo_cache,
    index,
    lru_cache,
    osutils,
    static_tuple,
    trace,
    transport,
    )
""")
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    map,
    range,
    viewitems,
    viewvalues,
    )

_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
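            # The row has outgrown its first node, so it is moved from the
            # in-memory buffer to a temporary file; whole rows can get large.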

        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
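        # Interning means equal keys share a single immutable object, which
        # keeps memory down when the same key recurs across reference lists.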

            new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:

            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
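                # A None marks iterator ``pos`` as exhausted; such slots are
                # filtered out of the candidates on later passes of the merge.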

            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree

            rows[-1].writer = chunk_writer.ChunkWriter(length, 0,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise errors.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
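            # The key that opens the new leaf is recorded, newline-terminated
            # like every index line, as the separator entry in the parent row.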

            string_key, line = _btree_serializer._flatten_node(node,
                self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
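        # These option lines must fit inside the _RESERVED_HEADER_BYTES that
        # the root page left free when it was spooled out.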

            position = 0  # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
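                # Leaf rows are exempt: their final node is written unpadded,
                # so a short copy there is expected rather than an error.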

                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]

        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
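        # The nodes-by-key trie walk is shared with the plain GraphIndex
        # implementation via index._iter_entries_prefix, not duplicated here.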

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
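                    # e.g. a two-element key (b'a', b'b') ends up stored as
                    # nodes_by_key[b'a'][b'b'] = ((b'a', b'b'), value)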

        """In memory indexes have no known corruption at the moment."""


class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        key_list = _btree_serializer._parse_leaf_lines(bytes,
            key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
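
    # As a dict subclass the node itself maps key -> (value, refs), so the
    # lookup paths below can use ``key in node`` and ``node[key]`` directly
    # instead of going through a separate .keys mapping.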


class _InternalNode(object):

        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())

        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None  # lru_cache.LRUCache(100*1000)
        if unlimited_cache:

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)
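            # Reading everything that is left costs little more than the
            # request being made anyway, and it saves later round trips.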

    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets

    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
                        # before.
                        break
                    self._leaf_value_cache[key] = value

        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return

        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = []
        else:
            nodes = self._read_nodes(needed_offsets)

        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)

        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1:  # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass
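
        # The walk below advances both sorted sequences together, batching
        # the in_keys that sort before each fixed key; the two sentinel
        # exceptions simply unwind the nested loops when either side is done.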

                while cur_in_key < cur_fixed_key:
                    cur_keys.append(cur_in_key)
                    try:
                        cur_in_key = next(in_keys_iter)
                    except StopIteration:
                        raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key

            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
                    else:
                        yield (self, next_sub_key, value)

            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)

            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
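                    # Parents that do not live on this page are deferred and
                    # resolved once their own pages have been read.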

        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:

                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise errors.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                                 if length]
        except ValueError:
            raise errors.BadIndexOptions(self)
        self._compute_row_offsets()
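        # e.g. b"row_lengths=1,4,96" describes one root page, a row of four
        # internal pages and a row of 96 leaf pages, from the root down.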

            self._size = num_bytes - base_offset
            # the whole thing should be parsed out of 'bytes'
            ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                      for start in range(base_offset, num_bytes, _PAGE_SIZE)]

            if offset > self._size:

            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
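            # Every page is a zlib-compressed chunk of at most _PAGE_SIZE
            # bytes whose first line marks it as a leaf or an internal node.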

            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
            pass

_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer
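# The compiled pyx serializer is optional: when its import fails, the failure
# is reported via osutils.failed_to_load_extension and the pure-Python
# serializer is used instead.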