"""B+Tree indices"""

from __future__ import absolute_import, division

from ..lazy_import import lazy_import
# (The lazy-imported names below are the ones this module is seen to use;
# the original import list may be longer.)
lazy_import(globals(), """
import math
import tempfile

from breezy import (
    chunk_writer,
    debug,
    index,
    osutils,
    static_tuple,
    trace,
    )
""")

from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    viewitems,
    viewvalues,
    )
_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120
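
# Sketch of the header these constants produce (the option names come from
# breezy.index; the numeric values here are illustrative only):
#
#   B+Tree Graph Index 2
#   node_ref_lists=1
#   key_elements=1
#   len=1000
#   row_lengths=1,4,250
#
# The header occupies the start of page 0, padded to _RESERVED_HEADER_BYTES;
# the rest of the file is a sequence of fixed-size pages.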
    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            # padded note:
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
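            # Spooling strategy (summary of the branches above): a row's
            # first node lives in a BytesIO so small indices never touch
            # disk; once a second node is finished, the row's content is
            # respooled into the temporary file created here.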
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
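        # StaticTuple is breezy's interned, memory-compact tuple type; using
        # it for the (node_refs, value) pairs keeps large in-memory builders
        # cheap.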
            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise index.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None
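            # The surrounding loop is a k-way merge over per-index iterators
            # that are already sorted: emit the smallest decorated entry,
            # then advance only the iterator it came from, with None marking
            # an exhausted source.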
                        optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET +
                        b"%d\n" % rows[pos + 1].nodes)
            # add a new leaf
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            if new_row:
                # We reached the current root without being able to mark the
                # division point, so add a new root row. (The body of this
                # branch is partly reconstructed from context.)
                new_row = _InternalBuilderRow()
                reserved_bytes = 0
                rows.insert(1, new_row)
                new_row.writer = chunk_writer.ChunkWriter(
                    _PAGE_SIZE - reserved_bytes,
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET +
                    b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
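            # If no existing internal node could absorb the boundary key, a
            # new internal row was spliced in above and the recursive call
            # retries the insert against the taller tree.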
                                    self.reference_lists)
            self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
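        # When the leaf row spans more than one page the assembled index can
        # be large, so it is spooled to a NamedTemporaryFile; a single-page
        # index is cheap enough to build in a BytesIO.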
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0 # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                        " expected: %d, got: %d"
                        % ((row.nodes - 1) * _PAGE_SIZE,
                           copied_len))
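            # Layout invariant: every row is padded to whole _PAGE_SIZE
            # pages, and only the file's first page gives up
            # _RESERVED_HEADER_BYTES to the header, which is why `position`
            # is zeroed once the first row has been copied.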
            yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
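            # Illustrative shape of nodes_by_key for the 2-element keys
            # ('a', 'b') and ('a', 'c') without reference lists:
            #   {'a': {'b': (('a', 'b'), value1),
            #          'c': (('a', 'c'), value2)}}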
    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters; don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])  # lines[1] is b"offset=<n>"
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        return nodes
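    # A decompressed internal-node page looks like this sketch (key bytes
    # illustrative; elements of a composite key are joined by NUL, hence the
    # b'\0' split above):
    #   type=internal\n
    #   offset=0\n
    #   element1\x00element2\n
    #   ...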
        self._row_lengths = None
        self._row_offsets = None # Start of each row, [-1] is the end

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        # true division (see the __future__ import above) avoids the float()
        # cast the old code needed
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
        return recommended_pages
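        # Worked example, assuming this module's usual 4096-byte _PAGE_SIZE:
        # a transport recommending 64kB reads yields
        # ceil(65536 / 4096) = 16 pages per request.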
    def _compute_total_pages_in_index(self):
        if self._row_offsets is not None:
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index.
        # They should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages
    def _expand_offsets(self, offsets):
        # ...
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)
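            # Expansion coalesces a request for a few scattered pages into
            # larger contiguous reads when the transport favours big
            # requests, while never re-reading pages already in a cache.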
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
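            # Illustrative: row_lengths of [1, 4, 63] give _row_offsets of
            # [0, 1, 5, 68]; the leaves are then pages 5 through 67, exactly
            # the range computed above.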
        # return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception): pass
        class FixedDone(Exception): pass

        # ...
                while cur_in_key < cur_fixed_key:
                    cur_keys.append(cur_in_key)
                    try:
                        cur_in_key = next(in_keys_iter)
                    except StopIteration:
                        raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
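                # Walking both sorted sequences together gives bisect_right
                # semantics for many search keys in one pass: every in_key
                # strictly below the current fixed key is grouped under the
                # current fixed offset.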
        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        if self.node_ref_lists:
            if self._key_length == 1:
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def key_count(self):
        """Return an estimate of the number of keys in this index.
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()
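        # E.g. parsing the sketch header near the top of this module would
        # leave node_ref_lists=1, _key_length=1, _key_count=1000 and
        # _row_lengths=[1, 4, 250] on this index.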
        # calculate the bytes we have processed
            self._size = num_bytes - base_offset
            # the whole thing should be parsed out of 'bytes'
            ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                for start in range(base_offset, num_bytes, _PAGE_SIZE)]
        # ...
            if offset > self._size:
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            # floor division keeps the page number an int on Python 3
            yield offset // _PAGE_SIZE, node

    def _signature(self):
        """The file signature for this index type."""
            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer
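
# This is the usual breezy extension-loading pattern: prefer the compiled
# serializer and fall back to the pure-Python implementation, reporting the
# failed import once via osutils.failed_to_load_extension.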