# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
"""B+Tree indices"""

from bisect import bisect_right

from ..lazy_import import lazy_import
lazy_import(globals(), """
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    viewitems,
    viewvalues,
    )

_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120
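# The byte literals above are the on-disk markers of the format: _BTSIGNATURE
# identifies the file, _OPTION_ROW_LENGTHS prefixes a header option line, the
# leaf/internal flags open each serialised page, and _RESERVED_HEADER_BYTES is
# the space kept free at the front of the first page for the header lines.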

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
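        # The spool strategy above: a row of a single node stays in an
        # in-memory BytesIO (zero-filled up front so the header can later be
        # written over the reserved space), but as soon as a row grows past
        # one node the buffered bytes move into an anonymous temporary file.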

        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
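        # Illustrative call (values are an assumption for clarity, not from
        # the original source): with one reference list and 1-element keys,
        #     builder.add_node((b'rev-2',), b'value', ([(b'rev-1',)],))
        # stores b'value' under (b'rev-2',) with a single reference list
        # pointing at (b'rev-1',).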

        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:

            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise index.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
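            # Taken together, this loop is a k-way merge over the sorted
            # iterators: the smallest current key is yielded, its iterator is
            # advanced, and a key repeated across iterators is reported via
            # the BadIndexDuplicateKey check on 'last' above.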

            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree

                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                                              b"%d\n" % rows[pos + 1].nodes)

            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES # padded
            rows[-1].writer = chunk_writer.ChunkWriter(length,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it

                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                                     b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
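        # When a line does not fit in the current leaf, the leaf is finished
        # and the first key of the next leaf is promoted into the row above;
        # if no existing internal row can absorb it, a new root row is
        # inserted at rows[0], which is how the tree gains height.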

                self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))

            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                yield (self,) + node[1:]

        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)

                node = self._nodes[key]

                yield self, key, node[1]
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
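        # The resulting structure is a nested-dict trie keyed on key elements.
        # For example (illustrative), keys (b'a', b'b') and (b'a', b'c') end
        # up as {b'a': {b'b': ((b'a', b'b'), value, refs),
        #               b'c': ((b'a', b'c'), value, refs)}}.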

    def validate(self):
        """In memory indexes have no known corruption at the moment."""

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return self._nodes < other._nodes
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeGraphIndex):
            return False


class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""

            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
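    # _LeafNode now subclasses dict, so `key in node` and `node[key]` replace
    # the old `node.keys` attribute; all_items()/all_keys() provide the sorted
    # views that callers previously built from node.keys themselves.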


class _InternalNode(object):

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:

            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        return nodes
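    # A serialised internal page therefore looks like: a b"type=internal\n"
    # flag line, a b"offset=N\n" line (lines[1][7:] skips the 7-byte
    # "offset=" prefix), then one \0-separated key per line.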

        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
        if unlimited_cache:

        self._row_lengths = None
        self._row_offsets = None # Start of each row, [-1] is the end

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return ((self._transport, self._name, self._size) <
                    (other._transport, other._name, other._size))
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeBuilder):
            return True

    def __ne__(self, other):
        return not self.__eq__(other)
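    # __eq__/__lt__ give these objects a deterministic order so that lists
    # mixing BTreeGraphIndex and BTreeBuilder instances can be sorted under
    # Python 3, which no longer falls back to comparing arbitrary objects.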

        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
        return recommended_pages

    def _compute_total_pages_in_index(self):

            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages

    def _expand_offsets(self, offsets):

        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)

    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets

    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node

        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything

        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)
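        # Note that only the leaf pages (offsets start_of_leaves to
        # end_of_leaves) are read here: the leaves hold every key and value,
        # so the internal rows can be skipped for a full iteration.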

        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1: # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass

                while cur_in_key < cur_fixed_key:
                    cur_keys.append(cur_in_key)
                    try:
                        cur_in_key = next(in_keys_iter)
                    except StopIteration:
                        raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
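                # Taken as a whole, _multi_bisect_right walks the sorted
                # search keys and the fixed boundary keys in lock-step,
                # partitioning M keys over N boundaries in O(M + N) steps
                # rather than M separate bisections, the trade-off sketched in
                # the iter_steps/bisect_steps comments above.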

            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)

            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)

            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
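                        # Any parent that happens to live on this same leaf
                        # page is resolved immediately, avoiding another round
                        # of page reads for it.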

        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        if self.node_ref_lists:
            if self._key_length == 1:

                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]

                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                                 if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()
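        # A parsed header therefore begins with lines like these (values are
        # illustrative):
        #
        #     B+Tree Graph Index 2
        #     node_ref_lists=1
        #     key_elements=1
        #     len=100
        #     row_lengths=1,25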
        # calculate the bytes we have processed

                    self._size = num_bytes - base_offset
                    # the whole thing should be parsed out of 'bytes'
                    ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                              for start in range(base_offset, num_bytes, _PAGE_SIZE)]

                if offset > self._size:

            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node
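            # Pages are fixed-size, so the page number is the byte offset
            # floor-divided by _PAGE_SIZE; a plain / would produce a float
            # under Python 3's true division.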

    def _signature(self):
        """The file signature for this index type."""
        return _BTSIGNATURE

            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
            pass


_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer
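
# Illustrative usage sketch (not part of the original module; the transport
# setup and exact values are assumptions):
#
#     builder = BTreeBuilder(reference_lists=1, key_elements=1)
#     builder.add_node((b'rev-1',), b'value-1', ([],))
#     builder.add_node((b'rev-2',), b'value-2', ([(b'rev-1',)],))
#     stream = builder.finish()  # file-like object with the serialised index
#
# Once the bytes are written somewhere a transport can reach, the index can be
# reopened with BTreeGraphIndex(transport, name, size) and queried through
# iter_all_entries() or iter_entries().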