# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

"""B+Tree indices"""
from __future__ import absolute_import, division

from ..lazy_import import lazy_import
lazy_import(globals(), """
import bisect
import math
import tempfile
import zlib
""")

from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    map,
    range,
    viewitems,
    viewvalues,
    )

_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120
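# BTreeBuilder.finish() (below) assembles a header from these constants; the
# serialised form looks roughly like this (illustrative values):
#
#   B+Tree Graph Index 2
#   node_ref_lists=1
#   key_elements=1
#   len=100
#   row_lengths=1,4
#
# The option names come from the _OPTION_* values imported from .index. The
# root row reserves _RESERVED_HEADER_BYTES of NUL padding up front so the
# real header can later be written into that space without moving the node
# pages that follow it.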
    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
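    # Usage sketch (illustrative; BTreeBuilder and its finish() method are
    # defined in this module but outside this excerpt, and the keys and
    # values here are made up):
    #
    #   builder = BTreeBuilder(reference_lists=1, key_elements=1)
    #   builder.add_node((b'rev-1',), b'value-1', ([],))
    #   builder.add_node((b'rev-2',), b'value-2', ([(b'rev-1',)],))
    #   stream = builder.finish()  # file-like object of serialised pages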
        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
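            # When combining is enabled, the just-spilled file is merged with
            # the existing backing indices so the number of on-disk spill
            # files, and hence the per-lookup cost, stays small.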
            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise index.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
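            # Only the iterator that produced the smallest entry is advanced,
            # so the combined stream stays sorted and duplicate keys arrive as
            # consecutive 'selected' values, which the 'last' comparison above
            # turns into BadIndexDuplicateKey.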
        flag when writing out. This is used by the _spill_mem_keys_to_disk
        functionality.
        """
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                    optimize_for_size=optimize_for_size)
                internal_row.writer.write(_INTERNAL_FLAG)
                internal_row.writer.write(_INTERNAL_OFFSET +
                                          b"%d\n" % rows[pos + 1].nodes)
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                                     b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
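    # Row bookkeeping sketch (illustrative): rows[-1] is the leaf row and
    # rows[0] the root row. When a line does not fit in the current leaf
    # page, that page is sealed with finish_node(), the first key of the new
    # page is pushed into each internal row above it, and _add_key() recurses
    # to retry the write against the fresh page.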
                self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                        " expected: %d, got: %d"
                        % ((row.nodes - 1) * _PAGE_SIZE,
                    yield (self,) + node[1:]

        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                yield self, key, node[1]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
        """In memory indexes have no known corruption at the moment."""


class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
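    # Data-shape sketch (hypothetical values): a parsed leaf behaves like
    #
    #   {(b'key1',): (b'value1', (((b'ref1',),),))}
    #
    # so 'key in node' and 'node[key]' work directly on the dict subclass,
    # while all_items()/all_keys() provide the sorted views iteration needs.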
class _InternalNode(object):

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
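    # Why split(b'\n') instead of splitlines(): key chunks may contain b'\r',
    # which splitlines() also treats as a line break, e.g.
    #
    #   b'a\rb\nc'.split(b'\n')  ->  [b'a\rb', b'c']
    #   b'a\rb\nc'.splitlines()  ->  [b'a', b'b', b'c']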
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
        self._row_lengths = None
        self._row_offsets = None # Start of each row, [-1] is the end
    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
        return recommended_pages

    def _compute_total_pages_in_index(self):
        if self._row_offsets is not None:
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages
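    # Worked example (illustrative, assuming this module's _PAGE_SIZE of
    # 4096, defined outside this excerpt): a transport recommending 64KiB
    # reads gives ceil(65536 / 4096) = 16 recommended pages, and a
    # 10000-byte index spans ceil(10000 / 4096) = 3 pages.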
    def _expand_offsets(self, offsets):
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1: # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass

        while cur_in_key < cur_fixed_key:
            cur_keys.append(cur_in_key)
            try:
                cur_in_key = next(in_keys_iter)
            except StopIteration:
                raise InputDone
        # At this point cur_in_key must be >= cur_fixed_key
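        # Behaviour sketch (hypothetical values): for in_keys=[b'b', b'd']
        # and fixed_keys=[b'c'], the result is [(0, [b'b']), (1, [b'd'])] --
        # each bisect offset into fixed_keys paired with every in_key that
        # lands there, so a child page is visited once for all of its keys.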
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)

            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                for _1, key, value, refs in self.iter_all_entries():
                    nodes[key] = value, refs
            else:
                nodes_by_key = {}
                for _1, key, value, refs in self.iter_all_entries():
                    key_value = key, value, refs
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def key_count(self):
        """Return an estimate of the number of keys in this index."""

    def _parse_header_from_bytes(self, bytes):
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()
        # calculate the bytes we have processed
        self._size = num_bytes - base_offset
        # the whole thing should be parsed out of 'bytes'
        ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                  for start in range(base_offset, num_bytes, _PAGE_SIZE)]
            if offset > self._size:
                raise AssertionError('tried to read past the end'
                                     ' of the index file')
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node
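            # Pages are fixed-size, so a byte offset maps back to a page
            # index by integer division: e.g. offset 8192 with 4096-byte
            # pages is page 2. '//' keeps this an int under the true
            # division enabled by the __future__ import at the top.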
    def _signature(self):
        """The file signature for this index type."""
        return _BTSIGNATURE

        # We shouldn't be reading anything anyway
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
_gcchk_factory = _LeafNode

try:
    # Prefer the compiled parser; fall back to the pure-Python one below.
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer