# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
"""B+Tree indices"""

from __future__ import absolute_import, division

from ..lazy_import import lazy_import
lazy_import(globals(), """
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    map,
    range,
    viewitems,
    viewvalues,
    )

_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120
_PAGE_SIZE = 4096
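# The first page of each row starts with _RESERVED_HEADER_BYTES of zero
# padding; for the root row, finish() below later overwrites that gap with
# the index header (signature, option lines and row lengths).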
    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
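            # The first node is buffered in memory (the BytesIO above); a row
            # only spills to an anonymous temporary file once it outgrows a
            # single page, so single-page rows never touch disk.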
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
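        # Interning the StaticTuple makes repeated additions of equal keys
        # share one object, keeping the in-memory node map compact.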
        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
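            # With combining enabled, spilled runs are merged into the
            # existing backing indices; otherwise each spill stays a separate
            # backing index (inferred from the method names in this fragment).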
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
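                # None marks this iterator as exhausted; presumably the
                # selection logic above skips slots that are None.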
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                                              b"%d\n" % rows[pos + 1].nodes)
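                    # The offset= value recorded here is the current node
                    # count of the row below, i.e. the position this node's
                    # first child will occupy in that row.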
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES  # padded
            rows[-1].writer = chunk_writer.ChunkWriter(length,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if rows[-1].nodes == 0:
                raise errors.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                                     b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
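            # The recursive call above retries the same line now that a fresh
            # leaf (and any required parent pages) have been opened.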
                self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
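        # 'position' records how much of the reserved root-page header area
        # the header lines occupy; the copy loop below zero-pads the rest.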
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0  # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
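                # A short copy is legitimate for leaf rows: their final page
                # is written unpadded (finish_node(pad=False) above).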
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.node_ref_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
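        # Prefix expansion over the nodes_by_key trie is delegated to the
        # shared helper in the index module rather than duplicated here.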
    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
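                    # Each key element adds one level of nesting, so
                    # nodes_by_key ends up a trie of dicts whose final
                    # element maps to the stored tuple.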
        """In memory indexes have no known corruption at the moment."""
class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)
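        # Subclassing dict lets readers test "key in node" and use node[key]
        # directly; _keys keeps a plain-dict copy as well (presumably for
        # code that still expects the old attribute).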
    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
class _InternalNode(object):

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])
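        # lines[1] looks like b"offset=<n>"; 7 == len(b"offset="), so this
        # strips the marker and parses the numeric offset.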
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        return nodes
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None  # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
        self._row_lengths = None
        self._row_offsets = None  # Start of each row, [-1] is the end

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
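        # Under the module's true division (__future__ import), e.g. a 64KiB
        # transport hint yields ceil(65536 / 4096) == 16 recommended pages.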
        return recommended_pages
    def _compute_total_pages_in_index(self):
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index.
        # They should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages
    def _expand_offsets(self, offsets):
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)
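        # (The expansion exists so transports with large optimal read sizes
        # fetch fewer, bigger batches; inferred from the condition above.)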
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1:  # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     for key in in_keys:
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass
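        # InputDone and FixedDone act as control-flow sentinels: the merge
        # loops below raise them when the input keys or the fixed page
        # boundaries run out, unwinding to the common exit in one jump.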
                while cur_in_key < cur_fixed_key:
                    cur_keys.append(cur_in_key)
                    try:
                        cur_in_key = next(in_keys_iter)
                    except StopIteration:
                        raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
            # sub_keys is all of the keys we are looking for that should
            # exist on this page; if they aren't here, they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
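                        # Parents that live on the page we already hold are
                        # resolved immediately, avoiding another descent for
                        # them on the next pass.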
        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise errors.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                                 if length]
        except ValueError:
            raise errors.BadIndexOptions(self)
        self._compute_row_offsets()
        self._size = num_bytes - base_offset
        # the whole thing should be parsed out of 'bytes'
        ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                  for start in range(base_offset, num_bytes, _PAGE_SIZE)]

        if offset > self._size:
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node
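            # "//" keeps the page number an int under true division; reads
            # are issued on _PAGE_SIZE boundaries (see the ranges computation
            # above), so the division is exact.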

    def _signature(self):
        """The file signature for this index type."""
        # We shouldn't be reading anything anyway
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer
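# The usual extension fallback: prefer the compiled serializer (which also
# supplies the specialised CHK leaf parser assigned above), dropping back to
# the pure-Python implementation after logging the failure.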