1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
1
# Copyright (C) 2008-2011 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
18
18
"""B+Tree indices"""
21
from bisect import bisect_right
20
from __future__ import absolute_import
22
from .lazy_import import lazy_import
23
lazy_import(globals(), """
37
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
38
from bzrlib.transport import get_transport
41
_BTSIGNATURE = "B+Tree Graph Index 2\n"
42
_OPTION_ROW_LENGTHS = "row_lengths="
43
_LEAF_FLAG = "type=leaf\n"
44
_INTERNAL_FLAG = "type=internal\n"
45
_INTERNAL_OFFSET = "offset="
42
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
53
_BTSIGNATURE = b"B+Tree Graph Index 2\n"
54
_OPTION_ROW_LENGTHS = b"row_lengths="
55
_LEAF_FLAG = b"type=leaf\n"
56
_INTERNAL_FLAG = b"type=internal\n"
57
_INTERNAL_OFFSET = b"offset="
47
59
_RESERVED_HEADER_BYTES = 120
68
80
def finish_node(self, pad=True):
69
81
byte_lines, _, padding = self.writer.finish()
70
82
if self.nodes == 0:
71
self.spool = cStringIO.StringIO()
83
self.spool = BytesIO()
73
85
self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
74
86
elif self.nodes == 1:
158
170
:param references: An iterable of iterables of keys. Each is a
159
171
reference to another key.
160
172
:param value: The value to associate with the key. It may be any
161
bytes as long as it does not contain \0 or \n.
173
bytes as long as it does not contain \\0 or \\n.
163
175
# Ensure that 'key' is a StaticTuple
164
176
key = static_tuple.StaticTuple.from_sequence(key).intern()
193
205
new_backing_file, size = self._spill_mem_keys_without_combining()
194
206
# Note: The transport here isn't strictly needed, because we will use
195
207
# direct access to the new_backing._file object
196
new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
208
new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
197
210
# GC will clean up the file
198
211
new_backing._file = new_backing_file
199
212
if self._combine_backing_indices:
276
289
yield (self,) + selected[1][1:]
277
290
pos = selected[0]
279
current_values[pos] = iterators_to_combine[pos].next()
292
current_values[pos] = next(iterators_to_combine[pos])
280
293
except StopIteration:
281
294
current_values[pos] = None
289
302
flag when writing out. This is used by the _spill_mem_keys_to_disk
292
306
if rows[-1].writer is None:
293
307
# opening a new leaf chunk;
294
309
for pos, internal_row in enumerate(rows[:-1]):
295
310
# flesh out any internal nodes that are needed to
296
311
# preserve the height of the tree
315
330
optimize_for_size=self._optimize_for_size)
316
331
rows[-1].writer.write(_LEAF_FLAG)
317
332
if rows[-1].writer.write(line):
333
# if we failed to write, despite having an empty page to write to,
334
# then line is too big. raising the error avoids infinite recursion
335
# searching for a suitably large page that will not be found.
337
raise errors.BadIndexKey(string_key)
318
338
# this key did not fit in the node:
319
339
rows[-1].finish_node()
320
340
key_line = string_key + "\n"
383
403
self.reference_lists)
384
404
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
385
405
for row in reversed(rows):
386
pad = (type(row) != _LeafBuilderRow)
406
pad = (not isinstance(row, _LeafBuilderRow))
387
407
row.finish_node(pad=pad)
388
408
lines = [_BTSIGNATURE]
389
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
390
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
391
lines.append(_OPTION_LEN + str(key_count) + '\n')
409
lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
410
lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
411
lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
392
412
row_lengths = [row.nodes for row in rows]
393
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
413
lines.append(_OPTION_ROW_LENGTHS + b','.join(
414
map(bytes, row_lengths)) + b'\n')
394
415
if row_lengths and row_lengths[-1] > 1:
395
416
result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
397
result = cStringIO.StringIO()
398
419
result.writelines(lines)
399
420
position = sum(map(len, lines))
416
437
position = 0 # Only the root row actually has an offset
417
438
copied_len = osutils.pumpfile(row.spool, result)
418
439
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
419
if type(row) != _LeafBuilderRow:
440
if not isinstance(row, _LeafBuilderRow):
420
441
raise AssertionError("Incorrect amount of data copied"
421
442
" expected: %d, got: %d"
422
443
% ((row.nodes - 1) * _PAGE_SIZE,
524
543
yield (self,) + node[1:]
525
544
if self._key_length == 1:
529
raise errors.BadIndexKey(key)
530
if len(key) != self._key_length:
531
raise errors.BadIndexKey(key)
546
index._sanity_check_key(self, key)
533
548
node = self._nodes[key]
539
554
yield self, key, node[1]
544
raise errors.BadIndexKey(key)
545
if len(key) != self._key_length:
546
raise errors.BadIndexKey(key)
547
# find what it refers to:
548
key_dict = self._get_nodes_by_key()
550
# find the subdict to return
552
while len(elements) and elements[0] is not None:
553
key_dict = key_dict[elements[0]]
556
# a non-existant lookup.
561
key_dict = dicts.pop(-1)
562
# can't be empty or would not exist
563
item, value = key_dict.iteritems().next()
564
if type(value) == dict:
566
dicts.extend(key_dict.itervalues())
569
for value in key_dict.itervalues():
570
yield (self, ) + tuple(value)
572
yield (self, ) + key_dict
556
nodes_by_key = self._get_nodes_by_key()
557
for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
574
560
def _get_nodes_by_key(self):
575
561
if self._nodes_by_key is None:
576
562
nodes_by_key = {}
577
563
if self.reference_lists:
578
for key, (references, value) in self._nodes.iteritems():
564
for key, (references, value) in viewitems(self._nodes):
579
565
key_dict = nodes_by_key
580
566
for subkey in key[:-1]:
581
567
key_dict = key_dict.setdefault(subkey, {})
582
568
key_dict[key[-1]] = key, value, references
584
for key, (references, value) in self._nodes.iteritems():
570
for key, (references, value) in viewitems(self._nodes):
585
571
key_dict = nodes_by_key
586
572
for subkey in key[:-1]:
587
573
key_dict = key_dict.setdefault(subkey, {})
601
587
"""In memory index's have no known corruption at the moment."""
604
class _LeafNode(object):
590
class _LeafNode(dict):
605
591
"""A leaf node for a serialised B+Tree index."""
607
__slots__ = ('keys', 'min_key', 'max_key')
593
__slots__ = ('min_key', 'max_key', '_keys')
609
595
def __init__(self, bytes, key_length, ref_list_length):
610
596
"""Parse bytes to create a leaf node object."""
616
602
self.max_key = key_list[-1][0]
618
604
self.min_key = self.max_key = None
619
self.keys = dict(key_list)
605
super(_LeafNode, self).__init__(key_list)
606
self._keys = dict(self)
609
"""Return a sorted list of (key, (value, refs)) items"""
610
items = sorted(self.items())
614
"""Return a sorted list of all keys."""
615
keys = sorted(self.keys())
622
619
class _InternalNode(object):
636
633
for line in lines[2:]:
639
nodes.append(as_st(map(intern, line.split('\0'))).intern())
636
# GZ 2017-05-24: Used to intern() each chunk of line as well, need
637
# to recheck performance and perhaps adapt StaticTuple to adjust.
638
nodes.append(as_st(line.split(b'\0')).intern())
671
670
self._recommended_pages = self._compute_recommended_pages()
672
671
self._root_node = None
673
672
self._base_offset = offset
673
self._leaf_factory = _LeafNode
674
674
# Default max size is 100,000 leave values
675
675
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
676
676
if unlimited_cache:
689
689
def __eq__(self, other):
690
690
"""Equal when self and other were created with the same parameters."""
692
type(self) == type(other) and
692
isinstance(self, type(other)) and
693
693
self._transport == other._transport and
694
694
self._name == other._name and
695
695
self._size == other._size)
787
787
if total_pages - len(cached_offsets) <= self._recommended_pages:
788
788
# Read whatever is left
789
789
if cached_offsets:
790
expanded = [x for x in xrange(total_pages)
790
expanded = [x for x in range(total_pages)
791
791
if x not in cached_offsets]
793
expanded = range(total_pages)
793
expanded = list(range(total_pages))
794
794
if 'index' in debug.debug_flags:
795
795
trace.mutter(' reading all unread pages: %s', expanded)
910
910
def _get_offsets_to_cached_pages(self):
911
911
"""Determine what nodes we already have cached."""
912
cached_offsets = set(self._internal_node_cache.keys())
912
cached_offsets = set(self._internal_node_cache)
913
# cache may be dict or LRUCache, keys() is the common method
913
914
cached_offsets.update(self._leaf_node_cache.keys())
914
915
if self._root_node is not None:
915
916
cached_offsets.add(0)
948
949
def _cache_leaf_values(self, nodes):
949
950
"""Cache directly from key => value, skipping the btree."""
950
951
if self._leaf_value_cache is not None:
951
for node in nodes.itervalues():
952
for key, value in node.keys.iteritems():
952
for node in viewvalues(nodes):
953
for key, value in node.all_items():
953
954
if key in self._leaf_value_cache:
954
955
# Don't add the rest of the keys, we've seen this node
979
980
if self._row_offsets[-1] == 1:
980
981
# There is only the root node, and we read that via key_count()
981
982
if self.node_ref_lists:
982
for key, (value, refs) in sorted(self._root_node.keys.items()):
983
for key, (value, refs) in self._root_node.all_items():
983
984
yield (self, key, value, refs)
985
for key, (value, refs) in sorted(self._root_node.keys.items()):
986
for key, (value, refs) in self._root_node.all_items():
986
987
yield (self, key, value)
988
989
start_of_leaves = self._row_offsets[-2]
989
990
end_of_leaves = self._row_offsets[-1]
990
needed_offsets = range(start_of_leaves, end_of_leaves)
991
needed_offsets = list(range(start_of_leaves, end_of_leaves))
991
992
if needed_offsets == [0]:
992
993
# Special case when we only have a root node, as we have already
993
994
# read everything
998
999
# for spilling index builds to disk.
999
1000
if self.node_ref_lists:
1000
1001
for _, node in nodes:
1001
for key, (value, refs) in sorted(node.keys.items()):
1002
for key, (value, refs) in node.all_items():
1002
1003
yield (self, key, value, refs)
1004
1005
for _, node in nodes:
1005
for key, (value, refs) in sorted(node.keys.items()):
1006
for key, (value, refs) in node.all_items():
1006
1007
yield (self, key, value)
1032
1033
# iter_steps = len(in_keys) + len(fixed_keys)
1033
1034
# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
1034
1035
if len(in_keys) == 1: # Bisect will always be faster for M = 1
1035
return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
1036
return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
1036
1037
# elif bisect_steps < iter_steps:
1038
1039
# for key in in_keys:
1041
1042
# return [(o, offsets[o]) for o in sorted(offsets)]
1042
1043
in_keys_iter = iter(in_keys)
1043
1044
fixed_keys_iter = enumerate(fixed_keys)
1044
cur_in_key = in_keys_iter.next()
1045
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1045
cur_in_key = next(in_keys_iter)
1046
cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)
1047
1048
class InputDone(Exception): pass
1048
1049
class FixedDone(Exception): pass
1064
1065
while cur_in_key < cur_fixed_key:
1065
1066
cur_keys.append(cur_in_key)
1067
cur_in_key = in_keys_iter.next()
1068
cur_in_key = next(in_keys_iter)
1068
1069
except StopIteration:
1069
1070
raise InputDone
1070
1071
# At this point cur_in_key must be >= cur_fixed_key
1170
1171
node = nodes[node_index]
1171
1172
for next_sub_key in sub_keys:
1172
if next_sub_key in node.keys:
1173
value, refs = node.keys[next_sub_key]
1173
if next_sub_key in node:
1174
value, refs = node[next_sub_key]
1174
1175
if self.node_ref_lists:
1175
1176
yield (self, next_sub_key, value, refs)
1244
1245
# sub_keys is all of the keys we are looking for that should exist
1245
1246
# on this page, if they aren't here, then they won't be found
1246
1247
node = nodes[node_index]
1247
node_keys = node.keys
1248
1248
parents_to_check = set()
1249
1249
for next_sub_key in sub_keys:
1250
if next_sub_key not in node_keys:
1250
if next_sub_key not in node:
1251
1251
# This one is just not present in the index at all
1252
1252
missing_keys.add(next_sub_key)
1254
value, refs = node_keys[next_sub_key]
1254
value, refs = node[next_sub_key]
1255
1255
parent_keys = refs[ref_list_num]
1256
1256
parent_map[next_sub_key] = parent_keys
1257
1257
parents_to_check.update(parent_keys)
1264
1264
while parents_to_check:
1265
1265
next_parents_to_check = set()
1266
1266
for key in parents_to_check:
1267
if key in node_keys:
1268
value, refs = node_keys[key]
1268
value, refs = node[key]
1269
1269
parent_keys = refs[ref_list_num]
1270
1270
parent_map[key] = parent_keys
1271
1271
next_parents_to_check.update(parent_keys)
1333
1333
self._get_root_node()
1334
1334
# TODO: only access nodes that can satisfy the prefixes we are looking
1335
1335
# for. For now, to meet API usage (as this function is not used by
1336
# current bzrlib) just suck the entire index and iterate in memory.
1336
# current breezy) just suck the entire index and iterate in memory.
1338
1338
if self.node_ref_lists:
1339
1339
if self._key_length == 1:
1365
1365
key_dict[key[-1]] = key_value
1366
1366
if self._key_length == 1:
1367
1367
for key in keys:
1370
raise errors.BadIndexKey(key)
1371
if len(key) != self._key_length:
1372
raise errors.BadIndexKey(key)
1368
index._sanity_check_key(self, key)
1374
1370
if self.node_ref_lists:
1375
1371
value, node_refs = nodes[key]
1379
1375
except KeyError:
1385
raise errors.BadIndexKey(key)
1386
if len(key) != self._key_length:
1387
raise errors.BadIndexKey(key)
1388
# find what it refers to:
1389
key_dict = nodes_by_key
1390
elements = list(key)
1391
# find the subdict whose contents should be returned.
1393
while len(elements) and elements[0] is not None:
1394
key_dict = key_dict[elements[0]]
1397
# a non-existant lookup.
1402
key_dict = dicts.pop(-1)
1403
# can't be empty or would not exist
1404
item, value = key_dict.iteritems().next()
1405
if type(value) == dict:
1407
dicts.extend(key_dict.itervalues())
1410
for value in key_dict.itervalues():
1411
# each value is the key:value:node refs tuple
1413
yield (self, ) + value
1415
# the last thing looked up was a terminal element
1416
yield (self, ) + key_dict
1378
for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
1418
1381
def key_count(self):
1419
1382
"""Return an estimate of the number of keys in this index.
1471
1434
if not options_line.startswith(_OPTION_ROW_LENGTHS):
1472
1435
raise errors.BadIndexOptions(self)
1474
self._row_lengths = map(int, [length for length in
1475
options_line[len(_OPTION_ROW_LENGTHS):].split(',')
1437
self._row_lengths = [int(length) for length in
1438
options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
1477
1440
except ValueError:
1478
1441
raise errors.BadIndexOptions(self)
1479
1442
self._compute_row_offsets()
1514
1477
self._size = num_bytes - base_offset
1515
1478
# the whole thing should be parsed out of 'bytes'
1516
1479
ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
1517
for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
1480
for start in range(base_offset, num_bytes, _PAGE_SIZE)]
1520
1483
if offset > self._size:
1546
1509
bytes = zlib.decompress(data)
1547
1510
if bytes.startswith(_LEAF_FLAG):
1548
node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
1511
node = self._leaf_factory(bytes, self._key_length,
1512
self.node_ref_lists)
1549
1513
elif bytes.startswith(_INTERNAL_FLAG):
1550
1514
node = _InternalNode(bytes)
1566
1530
# We shouldn't be reading anything anyway
1568
1532
node_end = self._row_offsets[-1]
1569
for node in self._read_nodes(range(start_node, node_end)):
1533
for node in self._read_nodes(list(range(start_node, node_end))):
1537
_gcchk_factory = _LeafNode
1574
from bzrlib import _btree_serializer_pyx as _btree_serializer
1575
except ImportError, e:
1540
from breezy import _btree_serializer_pyx as _btree_serializer
1541
_gcchk_factory = _btree_serializer._parse_into_chk
1542
except ImportError as e:
1576
1543
osutils.failed_to_load_extension(e)
1577
from bzrlib import _btree_serializer_py as _btree_serializer
1544
from breezy import _btree_serializer_py as _btree_serializer