# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
"""B+Tree indices"""

from __future__ import absolute_import, division

from bisect import bisect_right

from ..lazy_import import lazy_import
lazy_import(globals(), """
import bisect
import math
import tempfile
import zlib

from breezy import (
    debug,
    fifo_cache,
    lru_cache,
    osutils,
    static_tuple,
    trace,
    transport,
    )
from breezy.bzr import (
    chunk_writer,
    index,
    )
""")

from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    map,
    range,
    viewitems,
    viewvalues,
    )

_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
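            # A minimal sketch of the spill strategy above (illustrative only,
            # using a hypothetical `pages_written` counter): the first page is
            # buffered in memory, and the buffer moves to an anonymous
            # temporary file as soon as a second page is needed.
            #
            #     spool = BytesIO()
            #     spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
            #     if pages_written > 1:
            #         f = tempfile.TemporaryFile(prefix='bzr-index-row-')
            #         spool.seek(0)
            #         f.write(spool.read())
            #         spool = f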

        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
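        # Usage sketch (hypothetical keys and values, assuming the usual
        # BTreeBuilder(reference_lists=..., key_elements=...) signature):
        #
        #     builder = BTreeBuilder(reference_lists=1, key_elements=1)
        #     builder.add_node((b'parent',), b'value-bytes', ((),))
        #     builder.add_node((b'child',), b'more-bytes', (((b'parent',),),))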

        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:

            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise index.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
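        # The surrounding loop is a hand-rolled k-way merge with duplicate
        # detection. A rough standard-library equivalent (Python 3.5+;
        # assumes each iterator yields (index, key, value, ...) tuples
        # sorted by key):
        #
        #     import heapq
        #     merged = heapq.merge(*iterators_to_combine,
        #                          key=lambda node: node[1])
        #     last = None
        #     for node in merged:
        #         if node[1] == last:
        #             raise index.BadIndexDuplicateKey(last, self)
        #         last = node[1]
        #         yield (self,) + node[1:]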

            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES  # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(
                        length, 0, optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                                              b"%d\n" % rows[pos + 1].nodes)
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES  # padded
            rows[-1].writer = chunk_writer.ChunkWriter(
                length, optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            if new_row:

                new_row.writer = chunk_writer.ChunkWriter(
                    length, 0, optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                                     b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)

            string_key, line = _btree_serializer._flatten_node(
                node, self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))

            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0  # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
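        # For illustration, assuming the option prefixes defined in
        # breezy.bzr.index (node_ref_lists=, key_elements=, len=), the header
        # assembled above serialises like:
        #
        #     B+Tree Graph Index 2
        #     node_ref_lists=1
        #     key_elements=1
        #     len=100
        #     row_lengths=1,4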

                yield (self,) + node[1:]

        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
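            # Shape sketch: with 2-element keys (b'a', b'1') and (b'a', b'2')
            # and no reference lists configured, the nested dict built above
            # looks like
            #     {b'a': {b'1': ((b'a', b'1'), value1),
            #             b'2': ((b'a', b'2'), value2)}}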

    def validate(self):
        """In memory indexes have no known corruption at the moment."""

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return self._nodes < other._nodes
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeGraphIndex):
            return False

class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        key_list = _btree_serializer._parse_leaf_lines(
            bytes, key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys

class _InternalNode(object):
    """An internal node for a serialised B+Tree index."""

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        return nodes
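    # For reference, a decompressed internal-node page parsed above looks
    # roughly like this (sketch; each key line holds the tuple elements
    # joined by NUL bytes, and lines[1][7:] strips the b"offset=" prefix):
    #
    #     type=internal
    #     offset=1
    #     keyprefix\x00element2
    #     otherprefix\x00element2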

        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None  # lru_cache.LRUCache(100*1000)
        if unlimited_cache:

        self._row_lengths = None
        self._row_offsets = None  # Start of each row, [-1] is the end

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return ((self._name, self._size) < (other._name, other._size))
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeBuilder):
            return True

    def __ne__(self, other):
        return not self.__eq__(other)

        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
        return recommended_pages
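        # Worked example: a transport recommending 64kB reads yields
        # ceil(65536 / 4096) = 16 pages per request (assuming the usual
        # 4096-byte _PAGE_SIZE).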

    def _compute_total_pages_in_index(self):

        if self._row_offsets is not None:
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages

    def _expand_offsets(self, offsets):

        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter('  reading all unread pages: %s', expanded)
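            # e.g. with _recommended_pages = 16, total_pages = 20 and six
            # pages already cached, 20 - 6 = 14 <= 16, so every remaining
            # uncached page is scheduled in a single readahead batch.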

    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets

    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node

        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)

        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1:  # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass

                while cur_in_key < cur_fixed_key:
                    cur_keys.append(cur_in_key)
                    try:
                        cur_in_key = next(in_keys_iter)
                    except StopIteration:
                        raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
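                # Result shape sketch: with in_keys = [b'b', b'd', b'e'] and
                # fixed_keys = [b'c'], the method groups the input keys by
                # their bisect_right position among the fixed keys:
                #     [(0, [b'b']), (1, [b'd', b'e'])]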

            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)

            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)

            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)

        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                for _1, key, value, refs in self.iter_all_entries():
                    nodes[key] = value, refs
            else:
                nodes_by_key = {}
                for _1, key, value, refs in self.iter_all_entries():
                    key_value = key, value, refs
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value

        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                                 if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()

        # calculate the bytes we have processed

            self._size = num_bytes - base_offset
            # the whole thing should be parsed out of 'bytes'
            ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                      for start in range(base_offset, num_bytes, _PAGE_SIZE)]

            if offset > self._size:

            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node
1555
1538
def _signature(self):
1556
1539
"""The file signature for this index type."""
1566
1549
# We shouldn't be reading anything anyway
1568
1551
node_end = self._row_offsets[-1]
1569
for node in self._read_nodes(range(start_node, node_end)):
1552
for node in self._read_nodes(list(range(start_node, node_end))):
1556
_gcchk_factory = _LeafNode
1574
from bzrlib import _btree_serializer_pyx as _btree_serializer
1575
except ImportError, e:
1559
from . import _btree_serializer_pyx as _btree_serializer
1560
_gcchk_factory = _btree_serializer._parse_into_chk
1561
except ImportError as e:
1576
1562
osutils.failed_to_load_extension(e)
1577
from bzrlib import _btree_serializer_py as _btree_serializer
1563
from . import _btree_serializer_py as _btree_serializer