# Copyright (C) 2008-2011 Canonical Ltd
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

"""B+Tree indices"""

from __future__ import absolute_import, division

from ..lazy_import import lazy_import
lazy_import(globals(), """
import bisect

from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (

_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="
_RESERVED_HEADER_BYTES = 120
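
# Orientation note (added commentary, not part of the original source): the
# serialised index is a sequence of fixed-size pages of _PAGE_SIZE bytes.
# Page zero starts with a small plain-text header -- the signature above plus
# the node_ref_lists/key_elements/len/row_lengths option lines emitted by the
# builder code below -- and _RESERVED_HEADER_BYTES is the space reserved for
# that header before the root node's compressed data begins. Illustrative
# header (values are made up):
#
#   B+Tree Graph Index 2
#   node_ref_lists=1
#   key_elements=1
#   len=5000
#   row_lengths=1,3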

def finish_node(self, pad=True):
byte_lines, _, padding = self.writer.finish()
if self.nodes == 0:
self.spool = BytesIO()
self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
elif self.nodes == 1:
# We got bigger than 1 node, switch to a temp file
spool = tempfile.TemporaryFile(prefix='bzr-index-row-')

:param references: An iterable of iterables of keys. Each is a
reference to another key.
:param value: The value to associate with the key. It may be any
bytes as long as it does not contain \\0 or \\n.
# Ensure that 'key' is a StaticTuple
key = static_tuple.StaticTuple.from_sequence(key).intern()
# we don't care about absent_references
node_refs, _ = self._check_key_ref_value(key, references, value)
if key in self._nodes:
raise index.BadIndexDuplicateKey(key, self)
self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
if self._nodes_by_key is not None and self._key_length > 1:
self._update_nodes_by_key(key, value, node_refs)
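
# Illustrative usage of add_node() (added example; the builder arguments
# shown are assumptions based on the docstring above, not taken from this
# excerpt):
#
#   builder = BTreeBuilder(reference_lists=1, key_elements=1)
#   builder.add_node((b'rev-2',), b'100 2048', ([(b'rev-1',)],))
#
# Keys and references are interned StaticTuples; adding the same key twice
# raises BadIndexDuplicateKey, as checked above.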

new_backing_file, size = self._spill_mem_keys_without_combining()
# Note: The transport here isn't strictly needed, because we will use
# direct access to the new_backing._file object
new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
'<temp>', size)
# GC will clean up the file
new_backing._file = new_backing_file
if self._combine_backing_indices:

# undecorate back to (pos, node)
selected = selected[1]
if last == selected[1][1]:
raise index.BadIndexDuplicateKey(last, self)
last = selected[1][1]
# Yield, with self as the index
yield (self,) + selected[1][1:]
pos = selected[0]
current_values[pos] = next(iterators_to_combine[pos])
except StopIteration:
current_values[pos] = None
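
# The loop above is a k-way merge over per-index entry iterators that are
# already sorted by key: the smallest current entry is yielded, its iterator
# advanced, and a repeated key is rejected as a duplicate. A minimal
# standalone sketch of the same merging idea (added; not the code used here):
#
#   import heapq
#   def merge_sorted_entries(iterators):
#       for entry in heapq.merge(*iterators):
#           yield entry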

flag when writing out. This is used by the _spill_mem_keys_to_disk
if rows[-1].writer is None:
# opening a new leaf chunk;
for pos, internal_row in enumerate(rows[:-1]):
# flesh out any internal nodes that are needed to
# preserve the height of the tree

optimize_for_size=optimize_for_size)
internal_row.writer.write(_INTERNAL_FLAG)
internal_row.writer.write(_INTERNAL_OFFSET +
b"%d\n" % rows[pos + 1].nodes)
length = _PAGE_SIZE
if rows[-1].nodes == 0:
optimize_for_size=self._optimize_for_size)
rows[-1].writer.write(_LEAF_FLAG)
if rows[-1].writer.write(line):
# if we failed to write, despite having an empty page to write to,
# then line is too big. raising the error avoids infinite recursion
# searching for a suitably large page that will not be found.
raise index.BadIndexKey(string_key)
# this key did not fit in the node:
rows[-1].finish_node()
key_line = string_key + b"\n"
for row in reversed(rows[:-1]):
# Mark the start of the next node in the node above. If it

optimize_for_size=self._optimize_for_size)
new_row.writer.write(_INTERNAL_FLAG)
new_row.writer.write(_INTERNAL_OFFSET +
b"%d\n" % (rows[1].nodes - 1))
new_row.writer.write(key_line)
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
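
# Descriptive note (added): when a line does not fit into the current leaf
# page, the leaf is finished, the key that starts the next node is written
# into each parent internal node (creating new internal pages as needed to
# preserve the tree's height), and _add_key() is re-entered to place the
# line in the fresh leaf.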

self.reference_lists)
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
for row in reversed(rows):
pad = (not isinstance(row, _LeafBuilderRow))
row.finish_node(pad=pad)
lines = [_BTSIGNATURE]
lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
row_lengths = [row.nodes for row in rows]
lines.append(_OPTION_ROW_LENGTHS + ','.join(
map(str, row_lengths)).encode('ascii') + b'\n')
if row_lengths and row_lengths[-1] > 1:
result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
else:
result = BytesIO()
result.writelines(lines)
position = sum(map(len, lines))

node = row.spool.read(_PAGE_SIZE)
result.write(node[reserved:])
if len(node) == _PAGE_SIZE:
result.write(b"\x00" * (reserved - position))
position = 0 # Only the root row actually has an offset
copied_len = osutils.pumpfile(row.spool, result)
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
if not isinstance(row, _LeafBuilderRow):
raise AssertionError("Incorrect amount of data copied"
" expected: %d, got: %d"
% ((row.nodes - 1) * _PAGE_SIZE,
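
# Added note on the serialised layout: the serialisation step above writes
# the header lines into the reserved space on page zero, then appends each
# row's spooled pages in order (root row first, then any internal rows, then
# the leaf row), with every node padded out to _PAGE_SIZE. For example, a
# header of row_lengths=1,3 means one root page followed by three leaf pages.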

yield (self,) + node[1:]
if self._key_length == 1:
index._sanity_check_key(self, key)
node = self._nodes[key]
yield self, key, node[1]

nodes_by_key = self._get_nodes_by_key()
for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
def _get_nodes_by_key(self):
if self._nodes_by_key is None:
nodes_by_key = {}
if self.reference_lists:
for key, (references, value) in viewitems(self._nodes):
key_dict = nodes_by_key
for subkey in key[:-1]:
key_dict = key_dict.setdefault(subkey, {})
key_dict[key[-1]] = key, value, references
for key, (references, value) in viewitems(self._nodes):
key_dict = nodes_by_key
for subkey in key[:-1]:
key_dict = key_dict.setdefault(subkey, {})

def validate(self):
"""In memory index's have no known corruption at the moment."""
590
def __lt__(self, other):
if isinstance(other, type(self)):
return self._nodes < other._nodes
# Always sort existing indexes before ones that are still being built.
if isinstance(other, BTreeGraphIndex):

class _LeafNode(dict):
"""A leaf node for a serialised B+Tree index."""
__slots__ = ('min_key', 'max_key', '_keys')

def __init__(self, bytes, key_length, ref_list_length):
"""Parse bytes to create a leaf node object."""
self.max_key = key_list[-1][0]
self.min_key = self.max_key = None
super(_LeafNode, self).__init__(key_list)
self._keys = dict(self)

"""Return a sorted list of (key, (value, refs)) items"""
items = sorted(self.items())

"""Return a sorted list of all keys."""
keys = sorted(self.keys())
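
# Added note: in this version _LeafNode subclasses dict and maps each key
# tuple directly to its (value, refs) pair, e.g. node[(b'rev-2',)] might be
# (b'100 2048', (((b'rev-1',),),)) for an index with one reference list
# (illustrative values only). min_key/max_key record the lowest and highest
# keys parsed from the page, and all_items()/all_keys() return sorted views.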

class _InternalNode(object):
def __init__(self, bytes):
"""Parse bytes to create an internal node object."""
# splitlines mangles the \r delimiters.. don't use it.
self.keys = self._parse_lines(bytes.split(b'\n'))

def _parse_lines(self, lines):
self.offset = int(lines[1][7:])
as_st = static_tuple.StaticTuple.from_sequence
for line in lines[2:]:
# GZ 2017-05-24: Used to intern() each chunk of line as well, need
# to recheck performance and perhaps adapt StaticTuple to adjust.
nodes.append(as_st(line.split(b'\0')).intern())
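
# Added note on the page format implied above: an internal node page starts
# with the b"type=internal\n" flag line and a b"offset=N\n" line recording
# where this node's children begin in the next row, followed by one
# separator key per line with the key elements joined by NUL bytes.
# Illustrative decompressed page (made-up keys):
#
#   type=internal
#   offset=1
#   rev-0400
#   rev-0800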

self._recommended_pages = self._compute_recommended_pages()
self._root_node = None
self._base_offset = offset
self._leaf_factory = _LeafNode
# Default max size is 100,000 leaf values
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
if unlimited_cache:

self._row_lengths = None
self._row_offsets = None # Start of each row, [-1] is the end

def __eq__(self, other):
"""Equal when self and other were created with the same parameters."""
isinstance(self, type(other)) and
self._transport == other._transport and
self._name == other._name and
self._size == other._size)

def __lt__(self, other):
if isinstance(other, type(self)):
return ((self._name, self._size) < (other._name, other._size))
# Always sort existing indexes before ones that are still being built.
if isinstance(other, BTreeBuilder):

def __ne__(self, other):
return not self.__eq__(other)

pages fit in that length.
recommended_read = self._transport.recommended_page_size()
recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
return recommended_pages
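
# Worked example (added; _PAGE_SIZE is not shown in this excerpt): if the
# transport recommends 64 KiB reads and pages are 4 KiB, this computes
# ceil(65536 / 4096) = 16, so up to 16 index pages are fetched per request.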

def _compute_total_pages_in_index(self):
return self._row_offsets[-1]
# This is the number of pages as defined by the size of the index. They
# should be identical.
total_pages = int(math.ceil(self._size / _PAGE_SIZE))
return total_pages

def _expand_offsets(self, offsets):
if total_pages - len(cached_offsets) <= self._recommended_pages:
# Read whatever is left
if cached_offsets:
expanded = [x for x in range(total_pages)
if x not in cached_offsets]
expanded = list(range(total_pages))
if 'index' in debug.debug_flags:
trace.mutter(' reading all unread pages: %s', expanded)
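
# Added example of the read-ahead heuristic above: with total_pages=100,
# 10 pages already cached, and _recommended_pages=96, the 90 uncached pages
# fit inside one recommended read, so they are all scheduled at once.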

def _get_offsets_to_cached_pages(self):
"""Determine what nodes we already have cached."""
cached_offsets = set(self._internal_node_cache)
# cache may be dict or LRUCache, keys() is the common method
cached_offsets.update(self._leaf_node_cache.keys())
if self._root_node is not None:
cached_offsets.add(0)

def _cache_leaf_values(self, nodes):
"""Cache directly from key => value, skipping the btree."""
if self._leaf_value_cache is not None:
for node in viewvalues(nodes):
for key, value in node.all_items():
if key in self._leaf_value_cache:
# Don't add the rest of the keys, we've seen this node

if self._row_offsets[-1] == 1:
# There is only the root node, and we read that via key_count()
if self.node_ref_lists:
for key, (value, refs) in self._root_node.all_items():
yield (self, key, value, refs)
for key, (value, refs) in self._root_node.all_items():
yield (self, key, value)
start_of_leaves = self._row_offsets[-2]
end_of_leaves = self._row_offsets[-1]
needed_offsets = list(range(start_of_leaves, end_of_leaves))
if needed_offsets == [0]:
# Special case when we only have a root node, as we have already
# read everything
# for spilling index builds to disk.
if self.node_ref_lists:
for _, node in nodes:
for key, (value, refs) in node.all_items():
yield (self, key, value, refs)
for _, node in nodes:
for key, (value, refs) in node.all_items():
yield (self, key, value)

# iter_steps = len(in_keys) + len(fixed_keys)
# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
if len(in_keys) == 1: # Bisect will always be faster for M = 1
return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
# elif bisect_steps < iter_steps:
# for key in in_keys:
# return [(o, offsets[o]) for o in sorted(offsets)]
in_keys_iter = iter(in_keys)
fixed_keys_iter = enumerate(fixed_keys)
cur_in_key = next(in_keys_iter)
cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

class InputDone(Exception): pass
class FixedDone(Exception): pass

while cur_in_key < cur_fixed_key:
cur_keys.append(cur_in_key)
cur_in_key = next(in_keys_iter)
except StopIteration:
raise InputDone
# At this point cur_in_key must be >= cur_fixed_key
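
# Added example for _multi_bisect_right(in_keys, fixed_keys): it buckets the
# sorted search keys by the fixed separator keys and returns (offset, keys)
# pairs. With a single search key the bisect shortcut above applies, e.g.
# _multi_bisect_right([(b'm',)], [(b'g',), (b'r',)]) == [(1, [(b'm',)])],
# meaning the key belongs in child 1 of that internal node.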

node = nodes[node_index]
for next_sub_key in sub_keys:
if next_sub_key in node:
value, refs = node[next_sub_key]
if self.node_ref_lists:
yield (self, next_sub_key, value, refs)

# sub_keys is all of the keys we are looking for that should exist
# on this page, if they aren't here, then they won't be found
node = nodes[node_index]
parents_to_check = set()
for next_sub_key in sub_keys:
if next_sub_key not in node:
# This one is just not present in the index at all
missing_keys.add(next_sub_key)
value, refs = node[next_sub_key]
parent_keys = refs[ref_list_num]
parent_map[next_sub_key] = parent_keys
parents_to_check.update(parent_keys)
while parents_to_check:
next_parents_to_check = set()
for key in parents_to_check:
if key in node:
value, refs = node[key]
parent_keys = refs[ref_list_num]
parent_map[key] = parent_keys
next_parents_to_check.update(parent_keys)
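
# Added note: this loop opportunistically resolves parents that happen to
# live on the same leaf page, so a single page read can satisfy several
# generations of the ancestry walk before any further pages are requested.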

self._get_root_node()
# TODO: only access nodes that can satisfy the prefixes we are looking
# for. For now, to meet API usage (as this function is not used by
# current breezy) just suck the entire index and iterate in memory.
if self.node_ref_lists:
if self._key_length == 1:

key_dict[key[-1]] = key_value
if self._key_length == 1:
for key in keys:
index._sanity_check_key(self, key)
if self.node_ref_lists:
value, node_refs = nodes[key]
except KeyError:
for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
def key_count(self):
"""Return an estimate of the number of keys in this index.

signature = bytes[0:len(self._signature())]
if not signature == self._signature():
raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
lines = bytes[len(self._signature()):].splitlines()
options_line = lines[0]
if not options_line.startswith(_OPTION_NODE_REFS):
raise index.BadIndexOptions(self)
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
except ValueError:
raise index.BadIndexOptions(self)
options_line = lines[1]
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
raise index.BadIndexOptions(self)
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
except ValueError:
raise index.BadIndexOptions(self)
options_line = lines[2]
if not options_line.startswith(_OPTION_LEN):
raise index.BadIndexOptions(self)
self._key_count = int(options_line[len(_OPTION_LEN):])
except ValueError:
raise index.BadIndexOptions(self)
options_line = lines[3]
if not options_line.startswith(_OPTION_ROW_LENGTHS):
raise index.BadIndexOptions(self)
self._row_lengths = [int(length) for length in
options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
except ValueError:
raise index.BadIndexOptions(self)
self._compute_row_offsets()
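
# Added example of the parsed header state: for a header line of
# row_lengths=1,4,260 the parser sets _row_lengths = [1, 4, 260], and
# _compute_row_offsets() then yields the cumulative offsets [0, 1, 5, 265],
# whose last entry is the total number of node pages (compare
# _compute_total_pages_in_index above).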

# calculate the bytes we have processed

self._size = num_bytes - base_offset
# the whole thing should be parsed out of 'bytes'
ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
for start in range(base_offset, num_bytes, _PAGE_SIZE)]
if offset > self._size:

bytes = zlib.decompress(data)
if bytes.startswith(_LEAF_FLAG):
node = self._leaf_factory(bytes, self._key_length,
self.node_ref_lists)
elif bytes.startswith(_INTERNAL_FLAG):
node = _InternalNode(bytes)
raise AssertionError("Unknown node type for %r" % bytes)
yield offset // _PAGE_SIZE, node

def _signature(self):
"""The file signature for this index type."""

# We shouldn't be reading anything anyway
node_end = self._row_offsets[-1]
for node in self._read_nodes(list(range(start_node, node_end))):

_gcchk_factory = _LeafNode

from . import _btree_serializer_pyx as _btree_serializer
_gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
osutils.failed_to_load_extension(e)
from . import _btree_serializer_py as _btree_serializer
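
# Added note: the try/except above prefers the compiled _btree_serializer_pyx
# extension (which also supplies _parse_into_chk, installed as the module's
# _gcchk_factory) and falls back to the pure-Python _btree_serializer_py
# module when the extension cannot be loaded, after reporting the failure via
# osutils.failed_to_load_extension.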