# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

"""B+Tree indices"""
from __future__ import absolute_import, division

from bisect import bisect_right

from ..lazy_import import lazy_import
lazy_import(globals(), """
import bisect
import math
import tempfile
import zlib

from breezy import (
    debug,
    lru_cache,
    osutils,
    static_tuple,
    trace,
    transport,
    )
""")
from . import (
    chunk_writer,
    index,
    )
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    map,
    range,
    viewitems,
    viewvalues,
    )


_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120
_PAGE_SIZE = 4096
    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
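            # A row buffers its first page in a BytesIO and only moves to a
            # real temporary file once it grows past a single node, so small
            # index builds never touch disk.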
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise index.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
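        # The loop above is a k-way merge: each pass yields the smallest
        # remaining key across the combined iterators, refills only the slot
        # it was drawn from, and raises on duplicates across sources.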
    def _add_key(self, string_key, line, rows, allow_optimize=True):
        """Add a key to the current chunk.

        :param string_key: The key to add.
        :param line: The fully serialised key and value.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(length, 0,
                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                        b"%d\n" % rows[pos + 1].nodes)
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES # padded
            rows[-1].writer = chunk_writer.ChunkWriter(length,
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            if new_row:
                # We need a new row at the top of the tree.
                new_row = _InternalBuilderRow()
                rows.insert(0, new_row)
                new_row.writer = chunk_writer.ChunkWriter(
                    _PAGE_SIZE - _RESERVED_HEADER_BYTES, 0,
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                    b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
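        # When a leaf page fills up, the boundary key is pushed into the row
        # above (adding a new root row if it fits nowhere) and the add is
        # retried; this recursion is what grows the tree in height.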
            string_key, line = _btree_serializer._flatten_node(node,
                                    self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
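        # The header lines just written occupy part of the
        # _RESERVED_HEADER_BYTES that the root row's first page set aside;
        # 'position' records how much of that reservation the header used.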
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                        " expected: %d, got: %d"
                        % ((row.nodes - 1) * _PAGE_SIZE,
                           copied_len))
            for node in backing.iter_entries_prefix(keys):
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key
    def validate(self):
        """In memory indexes have no known corruption at the moment."""

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return self._nodes < other._nodes
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeGraphIndex):
            return False
        return NotImplemented
class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        key_list = _btree_serializer._parse_leaf_lines(bytes,
            key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
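
# _LeafNode subclasses dict so that membership tests and lookups during key
# searches ("key in node", "node[key]") go straight to the C-level dict
# implementation; min_key and max_key bound the page's key range so callers
# can select the right leaf without rescanning its contents.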
class _InternalNode(object):
    """An internal node for a serialised B+Tree index."""

    __slots__ = ('keys', 'offset')

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        # lines[1] is the _INTERNAL_OFFSET line; 7 == len(b"offset=")
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        return nodes
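
    # 'offset' is the position of this node's first child within the row
    # below; a lookup bisects 'keys' and adds the resulting position to
    # 'offset' to find which child page to read next.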
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
            self._leaf_node_cache = {}
            self._internal_node_cache = {}
        self._row_lengths = None
        self._row_offsets = None # Start of each row, [-1] is the end
    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return ((self._transport, self._name, self._size) <
                    (other._transport, other._name, other._size))
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeBuilder):
            return True
        return NotImplemented

    def __ne__(self, other):
        return not self.__eq__(other)
    def _compute_recommended_pages(self):
        """Convert the transport's recommended_page_size into btree pages.

        recommended_page_size is in bytes; we want to know how many _PAGE_SIZE
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        # "/" is true division here (from the __future__ import at the top),
        # so ceil() correctly rounds a partial trailing page up to a whole one.
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
        return recommended_pages
    def _compute_total_pages_in_index(self):
        """How many pages are in the index.

        If the header has been read, the page count recorded there is used;
        otherwise it is derived from the on-disk size.
        """
        if self._row_offsets is not None:
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages
    def _expand_offsets(self, offsets):
        cached_offsets = self._get_offsets_to_cached_pages()
        total_pages = self._compute_total_pages_in_index()
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)
            return expanded
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
                        # before.
                        break
                    self._leaf_value_cache[key] = value
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)
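        # Leaf pages form the final row of the file, so reading the
        # [start_of_leaves, end_of_leaves) range yields every key in sorted
        # order with a single sequential pass.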
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1: # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass
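        # These two sentinel exceptions let the nested merge loops below
        # terminate both loop levels at once when either key stream runs dry.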
                    while cur_in_key < cur_fixed_key:
                        cur_keys.append(cur_in_key)
                        try:
                            cur_in_key = next(in_keys_iter)
                        except StopIteration:
                            raise InputDone
                    # At this point cur_in_key must be >= cur_fixed_key
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
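            # Parents that resolve on the page we already have in memory are
            # chased immediately; any that do not are deferred to a later
            # batched read.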
        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                for _1, key, value, refs in self.iter_all_entries():
                    nodes[key] = value, refs
            else:
                nodes_by_key = {}
                for _1, key, value, refs in self.iter_all_entries():
                    key_value = key, value, refs
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For BTreeGraphIndex the estimate is exact, as it is contained in the
        header.
        """
        if self._row_lengths is None:
            self._get_root_node()
        return self._key_count
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()

        # calculate the bytes we have processed
        self._size = num_bytes - base_offset
        # the whole thing should be parsed out of 'bytes'
        ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                  for start in range(base_offset, num_bytes, _PAGE_SIZE)]
            if offset > self._size:
                raise AssertionError('tried to read past the end'
                                     ' of the file %s > %s'
                                     % (offset, self._size))
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node
    def _signature(self):
        """The file signature for this index type."""
        return _BTSIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # just read and parse every node.
        self._get_root_node()
        if len(self._row_lengths) > 1:
            start_node = self._row_offsets[1]
        else:
            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
            pass
_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer
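
# The compiled (pyx) serializer is preferred for speed; when the extension
# fails to import, the pure-Python module is a drop-in replacement, so the
# rest of this module never needs to know which implementation was loaded.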