1
# Copyright (C) 2008, 2009, 2010 Canonical Ltd
1
# Copyright (C) 2008-2011 Canonical Ltd
3
3
# This program is free software; you can redistribute it and/or modify
4
4
# it under the terms of the GNU General Public License as published by
18
18
"""B+Tree indices"""
21
from bisect import bisect_right
20
from io import BytesIO
22
from ..lazy_import import lazy_import
23
lazy_import(globals(), """
37
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
38
from bzrlib.transport import get_transport
41
_BTSIGNATURE = "B+Tree Graph Index 2\n"
42
_OPTION_ROW_LENGTHS = "row_lengths="
43
_LEAF_FLAG = "type=leaf\n"
44
_INTERNAL_FLAG = "type=internal\n"
45
_INTERNAL_OFFSET = "offset="
43
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
46
_BTSIGNATURE = b"B+Tree Graph Index 2\n"
47
_OPTION_ROW_LENGTHS = b"row_lengths="
48
_LEAF_FLAG = b"type=leaf\n"
49
_INTERNAL_FLAG = b"type=internal\n"
50
_INTERNAL_OFFSET = b"offset="
47
52
_RESERVED_HEADER_BYTES = 120
62
67
def __init__(self):
63
68
"""Create a _BuilderRow."""
65
self.spool = None# tempfile.TemporaryFile(prefix='bzr-index-row-')
70
self.spool = None # tempfile.TemporaryFile(prefix='bzr-index-row-')
68
73
def finish_node(self, pad=True):
69
74
byte_lines, _, padding = self.writer.finish()
70
75
if self.nodes == 0:
71
self.spool = cStringIO.StringIO()
76
self.spool = BytesIO()
73
self.spool.write("\x00" * _RESERVED_HEADER_BYTES)
78
self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
74
79
elif self.nodes == 1:
75
80
# We got bigger than 1 node, switch to a temp file
76
81
spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
137
142
of nodes that BTreeBuilder will hold in memory.
139
144
index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
140
key_elements=key_elements)
145
key_elements=key_elements)
141
146
self._spill_at = spill_at
142
147
self._backing_indices = []
143
148
# A map of {key: (node_refs, value)}
158
163
:param references: An iterable of iterables of keys. Each is a
159
164
reference to another key.
160
165
:param value: The value to associate with the key. It may be any
161
bytes as long as it does not contain \0 or \n.
166
bytes as long as it does not contain \\0 or \\n.
163
168
# Ensure that 'key' is a StaticTuple
164
169
key = static_tuple.StaticTuple.from_sequence(key).intern()
165
170
# we don't care about absent_references
166
171
node_refs, _ = self._check_key_ref_value(key, references, value)
167
172
if key in self._nodes:
168
raise errors.BadIndexDuplicateKey(key, self)
173
raise index.BadIndexDuplicateKey(key, self)
169
174
self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
170
175
if self._nodes_by_key is not None and self._key_length > 1:
171
176
self._update_nodes_by_key(key, value, node_refs)
193
198
new_backing_file, size = self._spill_mem_keys_without_combining()
194
199
# Note: The transport here isn't strictly needed, because we will use
195
200
# direct access to the new_backing._file object
196
new_backing = BTreeGraphIndex(get_transport('.'), '<temp>', size)
201
new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
197
203
# GC will clean up the file
198
204
new_backing._file = new_backing_file
199
205
if self._combine_backing_indices:
256
262
current_values = []
257
263
for iterator in iterators_to_combine:
259
current_values.append(iterator.next())
265
current_values.append(next(iterator))
260
266
except StopIteration:
261
267
current_values.append(None)
264
270
# Decorate candidates with the value to allow 2.4's min to be used.
265
271
candidates = [(item[1][1], item) for item
266
in enumerate(current_values) if item[1] is not None]
272
in enumerate(current_values) if item[1] is not None]
267
273
if not len(candidates):
269
275
selected = min(candidates)
270
276
# undecorate back to (pos, node)
271
277
selected = selected[1]
272
278
if last == selected[1][1]:
273
raise errors.BadIndexDuplicateKey(last, self)
279
raise index.BadIndexDuplicateKey(last, self)
274
280
last = selected[1][1]
275
281
# Yield, with self as the index
276
282
yield (self,) + selected[1][1:]
277
283
pos = selected[0]
279
current_values[pos] = iterators_to_combine[pos].next()
285
current_values[pos] = next(iterators_to_combine[pos])
280
286
except StopIteration:
281
287
current_values[pos] = None
289
295
flag when writing out. This is used by the _spill_mem_keys_to_disk
292
299
if rows[-1].writer is None:
293
300
# opening a new leaf chunk;
294
302
for pos, internal_row in enumerate(rows[:-1]):
295
303
# flesh out any internal nodes that are needed to
296
304
# preserve the height of the tree
297
305
if internal_row.writer is None:
298
306
length = _PAGE_SIZE
299
307
if internal_row.nodes == 0:
300
length -= _RESERVED_HEADER_BYTES # padded
308
length -= _RESERVED_HEADER_BYTES # padded
301
309
if allow_optimize:
302
310
optimize_for_size = self._optimize_for_size
304
312
optimize_for_size = False
305
internal_row.writer = chunk_writer.ChunkWriter(length, 0,
306
optimize_for_size=optimize_for_size)
313
internal_row.writer = chunk_writer.ChunkWriter(
314
length, 0, optimize_for_size=optimize_for_size)
307
315
internal_row.writer.write(_INTERNAL_FLAG)
308
internal_row.writer.write(_INTERNAL_OFFSET +
309
str(rows[pos + 1].nodes) + "\n")
316
internal_row.writer.write(_INTERNAL_OFFSET
317
+ b"%d\n" % rows[pos + 1].nodes)
311
319
length = _PAGE_SIZE
312
320
if rows[-1].nodes == 0:
313
length -= _RESERVED_HEADER_BYTES # padded
314
rows[-1].writer = chunk_writer.ChunkWriter(length,
315
optimize_for_size=self._optimize_for_size)
321
length -= _RESERVED_HEADER_BYTES # padded
322
rows[-1].writer = chunk_writer.ChunkWriter(
323
length, optimize_for_size=self._optimize_for_size)
316
324
rows[-1].writer.write(_LEAF_FLAG)
317
325
if rows[-1].writer.write(line):
326
# if we failed to write, despite having an empty page to write to,
327
# then line is too big. raising the error avoids infinite recursion
328
# searching for a suitably large page that will not be found.
330
raise index.BadIndexKey(string_key)
318
331
# this key did not fit in the node:
319
332
rows[-1].finish_node()
320
key_line = string_key + "\n"
333
key_line = string_key + b"\n"
322
335
for row in reversed(rows[:-1]):
323
336
# Mark the start of the next node in the node above. If it
345
358
optimize_for_size=self._optimize_for_size)
346
359
new_row.writer.write(_INTERNAL_FLAG)
347
new_row.writer.write(_INTERNAL_OFFSET +
348
str(rows[1].nodes - 1) + "\n")
360
new_row.writer.write(_INTERNAL_OFFSET
361
+ b"%d\n" % (rows[1].nodes - 1))
349
362
new_row.writer.write(key_line)
350
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
363
self._add_key(string_key, line, rows,
364
allow_optimize=allow_optimize)
352
366
def _write_nodes(self, node_iterator, allow_optimize=True):
353
367
"""Write node_iterator out as a B+Tree.
379
393
# First key triggers the first row
380
394
rows.append(_LeafBuilderRow())
382
string_key, line = _btree_serializer._flatten_node(node,
383
self.reference_lists)
384
self._add_key(string_key, line, rows, allow_optimize=allow_optimize)
396
string_key, line = _btree_serializer._flatten_node(
397
node, self.reference_lists)
398
self._add_key(string_key, line, rows,
399
allow_optimize=allow_optimize)
385
400
for row in reversed(rows):
386
pad = (type(row) != _LeafBuilderRow)
401
pad = (not isinstance(row, _LeafBuilderRow))
387
402
row.finish_node(pad=pad)
388
403
lines = [_BTSIGNATURE]
389
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
390
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
391
lines.append(_OPTION_LEN + str(key_count) + '\n')
404
lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
405
lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
406
lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
392
407
row_lengths = [row.nodes for row in rows]
393
lines.append(_OPTION_ROW_LENGTHS + ','.join(map(str, row_lengths)) + '\n')
408
lines.append(_OPTION_ROW_LENGTHS + ','.join(
409
map(str, row_lengths)).encode('ascii') + b'\n')
394
410
if row_lengths and row_lengths[-1] > 1:
395
411
result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
397
result = cStringIO.StringIO()
398
414
result.writelines(lines)
399
415
position = sum(map(len, lines))
401
416
if position > _RESERVED_HEADER_BYTES:
402
417
raise AssertionError("Could not fit the header in the"
403
418
" reserved space: %d > %d"
404
419
% (position, _RESERVED_HEADER_BYTES))
405
420
# write the rows out:
407
reserved = _RESERVED_HEADER_BYTES # reserved space for first node
422
reserved = _RESERVED_HEADER_BYTES # reserved space for first node
408
423
row.spool.flush()
409
424
row.spool.seek(0)
410
425
# copy nodes to the finalised file.
412
427
node = row.spool.read(_PAGE_SIZE)
413
428
result.write(node[reserved:])
414
429
if len(node) == _PAGE_SIZE:
415
result.write("\x00" * (reserved - position))
416
position = 0 # Only the root row actually has an offset
430
result.write(b"\x00" * (reserved - position))
431
position = 0 # Only the root row actually has an offset
417
432
copied_len = osutils.pumpfile(row.spool, result)
418
433
if copied_len != (row.nodes - 1) * _PAGE_SIZE:
419
if type(row) != _LeafBuilderRow:
434
if not isinstance(row, _LeafBuilderRow):
420
435
raise AssertionError("Incorrect amount of data copied"
421
" expected: %d, got: %d"
422
% ((row.nodes - 1) * _PAGE_SIZE,
436
" expected: %d, got: %d"
437
% ((row.nodes - 1) * _PAGE_SIZE,
425
440
size = result.tell()
442
457
efficient order for the index (in this case dictionary hash order).
444
459
if 'evil' in debug.debug_flags:
445
trace.mutter_callsite(3,
446
"iter_all_entries scales with size of history.")
460
trace.mutter_callsite(
461
3, "iter_all_entries scales with size of history.")
447
462
# Doing serial rather than ordered would be faster; but this shouldn't
448
463
# be getting called routinely anyway.
449
464
iterators = [self._iter_mem_nodes()]
458
473
"""Iterate over keys within the index.
460
475
:param keys: An iterable providing the keys to be retrieved.
461
:return: An iterable of (index, key, value, reference_lists). There is no
462
defined order for the result iteration - it will be in the most
476
:return: An iterable of (index, key, value, reference_lists). There is
477
no defined order for the result iteration - it will be in the most
463
478
efficient order for the index (keys iteration order in this case).
483
498
# Find things that are in backing indices that have not been handled
485
500
if not self._backing_indices:
486
return # We won't find anything there either
501
return # We won't find anything there either
487
502
# Remove all of the keys that we found locally
488
503
keys.difference_update(local_keys)
489
504
for backing in self._backing_indices:
524
537
yield (self,) + node[1:]
525
538
if self._key_length == 1:
529
raise errors.BadIndexKey(key)
530
if len(key) != self._key_length:
531
raise errors.BadIndexKey(key)
540
index._sanity_check_key(self, key)
533
542
node = self._nodes[key]
539
548
yield self, key, node[1]
544
raise errors.BadIndexKey(key)
545
if len(key) != self._key_length:
546
raise errors.BadIndexKey(key)
547
# find what it refers to:
548
key_dict = self._get_nodes_by_key()
550
# find the subdict to return
552
while len(elements) and elements[0] is not None:
553
key_dict = key_dict[elements[0]]
556
# a non-existant lookup.
561
key_dict = dicts.pop(-1)
562
# can't be empty or would not exist
563
item, value = key_dict.iteritems().next()
564
if type(value) == dict:
566
dicts.extend(key_dict.itervalues())
569
for value in key_dict.itervalues():
570
yield (self, ) + tuple(value)
572
yield (self, ) + key_dict
550
nodes_by_key = self._get_nodes_by_key()
551
for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
574
554
def _get_nodes_by_key(self):
575
555
if self._nodes_by_key is None:
576
556
nodes_by_key = {}
577
557
if self.reference_lists:
578
for key, (references, value) in self._nodes.iteritems():
558
for key, (references, value) in self._nodes.items():
579
559
key_dict = nodes_by_key
580
560
for subkey in key[:-1]:
581
561
key_dict = key_dict.setdefault(subkey, {})
582
562
key_dict[key[-1]] = key, value, references
584
for key, (references, value) in self._nodes.iteritems():
564
for key, (references, value) in self._nodes.items():
585
565
key_dict = nodes_by_key
586
566
for subkey in key[:-1]:
587
567
key_dict = key_dict.setdefault(subkey, {})
595
575
For InMemoryGraphIndex the estimate is exact.
597
return len(self._nodes) + sum(backing.key_count() for backing in
598
self._backing_indices if backing is not None)
577
return len(self._nodes) + sum(
579
for backing in self._backing_indices
580
if backing is not None)
600
582
def validate(self):
601
583
"""In memory index's have no known corruption at the moment."""
604
class _LeafNode(object):
585
def __lt__(self, other):
586
if isinstance(other, type(self)):
587
return self._nodes < other._nodes
588
# Always sort existing indexes before ones that are still being built.
589
if isinstance(other, BTreeGraphIndex):
594
class _LeafNode(dict):
605
595
"""A leaf node for a serialised B+Tree index."""
607
__slots__ = ('keys', 'min_key', 'max_key')
597
__slots__ = ('min_key', 'max_key', '_keys')
609
599
def __init__(self, bytes, key_length, ref_list_length):
610
600
"""Parse bytes to create a leaf node object."""
611
601
# splitlines mangles the \r delimiters.. don't use it.
612
key_list = _btree_serializer._parse_leaf_lines(bytes,
613
key_length, ref_list_length)
602
key_list = _btree_serializer._parse_leaf_lines(
603
bytes, key_length, ref_list_length)
615
605
self.min_key = key_list[0][0]
616
606
self.max_key = key_list[-1][0]
618
608
self.min_key = self.max_key = None
619
self.keys = dict(key_list)
609
super(_LeafNode, self).__init__(key_list)
610
self._keys = dict(self)
613
"""Return a sorted list of (key, (value, refs)) items"""
614
items = sorted(self.items())
618
"""Return a sorted list of all keys."""
619
keys = sorted(self.keys())
622
623
class _InternalNode(object):
627
628
def __init__(self, bytes):
628
629
"""Parse bytes to create an internal node object."""
629
630
# splitlines mangles the \r delimiters.. don't use it.
630
self.keys = self._parse_lines(bytes.split('\n'))
631
self.keys = self._parse_lines(bytes.split(b'\n'))
632
633
def _parse_lines(self, lines):
634
635
self.offset = int(lines[1][7:])
635
636
as_st = static_tuple.StaticTuple.from_sequence
636
637
for line in lines[2:]:
639
nodes.append(as_st(map(intern, line.split('\0'))).intern())
640
# GZ 2017-05-24: Used to intern() each chunk of line as well, need
641
# to recheck performance and perhaps adapt StaticTuple to adjust.
642
nodes.append(as_st(line.split(b'\0')).intern())
671
674
self._recommended_pages = self._compute_recommended_pages()
672
675
self._root_node = None
673
676
self._base_offset = offset
677
self._leaf_factory = _LeafNode
674
678
# Default max size is 100,000 leave values
675
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
679
self._leaf_value_cache = None # lru_cache.LRUCache(100*1000)
676
680
if unlimited_cache:
677
681
self._leaf_node_cache = {}
678
682
self._internal_node_cache = {}
684
688
self._internal_node_cache = fifo_cache.FIFOCache(100)
685
689
self._key_count = None
686
690
self._row_lengths = None
687
self._row_offsets = None # Start of each row, [-1] is the end
691
self._row_offsets = None # Start of each row, [-1] is the end
689
696
def __eq__(self, other):
690
697
"""Equal when self and other were created with the same parameters."""
692
type(self) == type(other) and
693
self._transport == other._transport and
694
self._name == other._name and
695
self._size == other._size)
699
isinstance(self, type(other))
700
and self._transport == other._transport
701
and self._name == other._name
702
and self._size == other._size)
704
def __lt__(self, other):
705
if isinstance(other, type(self)):
706
return ((self._name, self._size) < (other._name, other._size))
707
# Always sort existing indexes before ones that are still being built.
708
if isinstance(other, BTreeBuilder):
697
712
def __ne__(self, other):
698
713
return not self.__eq__(other)
732
747
pages fit in that length.
734
749
recommended_read = self._transport.recommended_page_size()
735
recommended_pages = int(math.ceil(recommended_read /
750
recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
737
751
return recommended_pages
739
753
def _compute_total_pages_in_index(self):
750
764
return self._row_offsets[-1]
751
765
# This is the number of pages as defined by the size of the index. They
752
766
# should be indentical.
753
total_pages = int(math.ceil(self._size / float(_PAGE_SIZE)))
767
total_pages = int(math.ceil(self._size / _PAGE_SIZE))
754
768
return total_pages
756
770
def _expand_offsets(self, offsets):
787
801
if total_pages - len(cached_offsets) <= self._recommended_pages:
788
802
# Read whatever is left
789
803
if cached_offsets:
790
expanded = [x for x in xrange(total_pages)
791
if x not in cached_offsets]
804
expanded = [x for x in range(total_pages)
805
if x not in cached_offsets]
793
expanded = range(total_pages)
807
expanded = list(range(total_pages))
794
808
if 'index' in debug.debug_flags:
795
809
trace.mutter(' reading all unread pages: %s', expanded)
845
859
if first is None:
846
860
first, end = self._find_layer_first_and_end(pos)
847
861
previous = pos - 1
849
and previous not in cached_offsets
850
and previous not in final_offsets
851
and previous >= first):
863
previous not in cached_offsets and
864
previous not in final_offsets and
852
866
next_tips.add(previous)
854
if (after < total_pages
855
and after not in cached_offsets
856
and after not in final_offsets
868
if (after < total_pages and
869
after not in cached_offsets and
870
after not in final_offsets and
858
872
next_tips.add(after)
859
873
# This would keep us from going bigger than
860
874
# recommended_pages by only expanding the first offsets.
884
898
self._get_root_node()
885
899
if ref_list_num + 1 > self.node_ref_lists:
886
900
raise ValueError('No ref list %d, index has %d ref lists'
887
% (ref_list_num, self.node_ref_lists))
901
% (ref_list_num, self.node_ref_lists))
890
904
for node in self.iter_all_entries():
910
924
def _get_offsets_to_cached_pages(self):
911
925
"""Determine what nodes we already have cached."""
912
cached_offsets = set(self._internal_node_cache.keys())
926
cached_offsets = set(self._internal_node_cache)
927
# cache may be dict or LRUCache, keys() is the common method
913
928
cached_offsets.update(self._leaf_node_cache.keys())
914
929
if self._root_node is not None:
915
930
cached_offsets.add(0)
948
963
def _cache_leaf_values(self, nodes):
949
964
"""Cache directly from key => value, skipping the btree."""
950
965
if self._leaf_value_cache is not None:
951
for node in nodes.itervalues():
952
for key, value in node.keys.iteritems():
966
for node in nodes.values():
967
for key, value in node.all_items():
953
968
if key in self._leaf_value_cache:
954
969
# Don't add the rest of the keys, we've seen this node
965
980
def iter_all_entries(self):
966
981
"""Iterate over all keys within the index.
968
:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
983
:return: An iterable of (index, key, value) or
984
(index, key, value, reference_lists).
969
985
The former tuple is used when there are no reference lists in the
970
986
index, making the API compatible with simple key:value index types.
971
987
There is no defined order for the result iteration - it will be in
972
988
the most efficient order for the index.
974
990
if 'evil' in debug.debug_flags:
975
trace.mutter_callsite(3,
976
"iter_all_entries scales with size of history.")
991
trace.mutter_callsite(
992
3, "iter_all_entries scales with size of history.")
977
993
if not self.key_count():
979
995
if self._row_offsets[-1] == 1:
980
996
# There is only the root node, and we read that via key_count()
981
997
if self.node_ref_lists:
982
for key, (value, refs) in sorted(self._root_node.keys.items()):
998
for key, (value, refs) in self._root_node.all_items():
983
999
yield (self, key, value, refs)
985
for key, (value, refs) in sorted(self._root_node.keys.items()):
1001
for key, (value, refs) in self._root_node.all_items():
986
1002
yield (self, key, value)
988
1004
start_of_leaves = self._row_offsets[-2]
989
1005
end_of_leaves = self._row_offsets[-1]
990
needed_offsets = range(start_of_leaves, end_of_leaves)
1006
needed_offsets = list(range(start_of_leaves, end_of_leaves))
991
1007
if needed_offsets == [0]:
992
1008
# Special case when we only have a root node, as we have already
993
1009
# read everything
998
1014
# for spilling index builds to disk.
999
1015
if self.node_ref_lists:
1000
1016
for _, node in nodes:
1001
for key, (value, refs) in sorted(node.keys.items()):
1017
for key, (value, refs) in node.all_items():
1002
1018
yield (self, key, value, refs)
1004
1020
for _, node in nodes:
1005
for key, (value, refs) in sorted(node.keys.items()):
1021
for key, (value, refs) in node.all_items():
1006
1022
yield (self, key, value)
1031
1047
# function, so there is even more to be gained.
1032
1048
# iter_steps = len(in_keys) + len(fixed_keys)
1033
1049
# bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
1034
if len(in_keys) == 1: # Bisect will always be faster for M = 1
1035
return [(bisect_right(fixed_keys, in_keys[0]), in_keys)]
1050
if len(in_keys) == 1: # Bisect will always be faster for M = 1
1051
return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
1036
1052
# elif bisect_steps < iter_steps:
1038
1054
# for key in in_keys:
1041
1057
# return [(o, offsets[o]) for o in sorted(offsets)]
1042
1058
in_keys_iter = iter(in_keys)
1043
1059
fixed_keys_iter = enumerate(fixed_keys)
1044
cur_in_key = in_keys_iter.next()
1045
cur_fixed_offset, cur_fixed_key = fixed_keys_iter.next()
1047
class InputDone(Exception): pass
1048
class FixedDone(Exception): pass
1060
cur_in_key = next(in_keys_iter)
1061
cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)
1063
class InputDone(Exception):
1066
class FixedDone(Exception):
1110
1129
positions = self._multi_bisect_right(sub_keys, node.keys)
1111
1130
node_offset = next_row_start + node.offset
1112
1131
next_nodes_and_keys.extend([(node_offset + pos, s_keys)
1113
for pos, s_keys in positions])
1132
for pos, s_keys in positions])
1114
1133
keys_at_index = next_nodes_and_keys
1115
1134
# We should now be at the _LeafNodes
1116
1135
node_indexes = [idx for idx, s_keys in keys_at_index]
1170
1188
node = nodes[node_index]
1171
1189
for next_sub_key in sub_keys:
1172
if next_sub_key in node.keys:
1173
value, refs = node.keys[next_sub_key]
1190
if next_sub_key in node:
1191
value, refs = node[next_sub_key]
1174
1192
if self.node_ref_lists:
1175
1193
yield (self, next_sub_key, value, refs)
1209
1227
if ref_list_num >= self.node_ref_lists:
1210
1228
raise ValueError('No ref list %d, index has %d ref lists'
1211
% (ref_list_num, self.node_ref_lists))
1229
% (ref_list_num, self.node_ref_lists))
1213
1231
# The main trick we are trying to accomplish is that when we find a
1214
1232
# key listing its parents, we expect that the parent key is also likely
1244
1262
# sub_keys is all of the keys we are looking for that should exist
1245
1263
# on this page, if they aren't here, then they won't be found
1246
1264
node = nodes[node_index]
1247
node_keys = node.keys
1248
1265
parents_to_check = set()
1249
1266
for next_sub_key in sub_keys:
1250
if next_sub_key not in node_keys:
1267
if next_sub_key not in node:
1251
1268
# This one is just not present in the index at all
1252
1269
missing_keys.add(next_sub_key)
1254
value, refs = node_keys[next_sub_key]
1271
value, refs = node[next_sub_key]
1255
1272
parent_keys = refs[ref_list_num]
1256
1273
parent_map[next_sub_key] = parent_keys
1257
1274
parents_to_check.update(parent_keys)
1264
1281
while parents_to_check:
1265
1282
next_parents_to_check = set()
1266
1283
for key in parents_to_check:
1267
if key in node_keys:
1268
value, refs = node_keys[key]
1285
value, refs = node[key]
1269
1286
parent_keys = refs[ref_list_num]
1270
1287
parent_map[key] = parent_keys
1271
1288
next_parents_to_check.update(parent_keys)
1291
1308
parents_not_on_page.add(key)
1293
# assert key != node.min_key and key != node.max_key
1310
# assert (key != node.min_key and
1311
# key != node.max_key)
1294
1312
# If it was going to be present, it would be on
1295
1313
# *this* page, so mark it missing.
1296
1314
missing_keys.add(key)
1333
1351
self._get_root_node()
1334
1352
# TODO: only access nodes that can satisfy the prefixes we are looking
1335
1353
# for. For now, to meet API usage (as this function is not used by
1336
# current bzrlib) just suck the entire index and iterate in memory.
1354
# current breezy) just suck the entire index and iterate in memory.
1338
1356
if self.node_ref_lists:
1339
1357
if self._key_length == 1:
1365
1383
key_dict[key[-1]] = key_value
1366
1384
if self._key_length == 1:
1367
1385
for key in keys:
1370
raise errors.BadIndexKey(key)
1371
if len(key) != self._key_length:
1372
raise errors.BadIndexKey(key)
1386
index._sanity_check_key(self, key)
1374
1388
if self.node_ref_lists:
1375
1389
value, node_refs = nodes[key]
1379
1393
except KeyError:
1385
raise errors.BadIndexKey(key)
1386
if len(key) != self._key_length:
1387
raise errors.BadIndexKey(key)
1388
# find what it refers to:
1389
key_dict = nodes_by_key
1390
elements = list(key)
1391
# find the subdict whose contents should be returned.
1393
while len(elements) and elements[0] is not None:
1394
key_dict = key_dict[elements[0]]
1397
# a non-existant lookup.
1402
key_dict = dicts.pop(-1)
1403
# can't be empty or would not exist
1404
item, value = key_dict.iteritems().next()
1405
if type(value) == dict:
1407
dicts.extend(key_dict.itervalues())
1410
for value in key_dict.itervalues():
1411
# each value is the key:value:node refs tuple
1413
yield (self, ) + value
1415
# the last thing looked up was a terminal element
1416
yield (self, ) + key_dict
1396
for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
1418
1399
def key_count(self):
1419
1400
"""Return an estimate of the number of keys in this index.
1445
1426
signature = bytes[0:len(self._signature())]
1446
1427
if not signature == self._signature():
1447
raise errors.BadIndexFormatSignature(self._name, BTreeGraphIndex)
1428
raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
1448
1429
lines = bytes[len(self._signature()):].splitlines()
1449
1430
options_line = lines[0]
1450
1431
if not options_line.startswith(_OPTION_NODE_REFS):
1451
raise errors.BadIndexOptions(self)
1432
raise index.BadIndexOptions(self)
1453
1434
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
1454
1435
except ValueError:
1455
raise errors.BadIndexOptions(self)
1436
raise index.BadIndexOptions(self)
1456
1437
options_line = lines[1]
1457
1438
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
1458
raise errors.BadIndexOptions(self)
1439
raise index.BadIndexOptions(self)
1460
1441
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
1461
1442
except ValueError:
1462
raise errors.BadIndexOptions(self)
1443
raise index.BadIndexOptions(self)
1463
1444
options_line = lines[2]
1464
1445
if not options_line.startswith(_OPTION_LEN):
1465
raise errors.BadIndexOptions(self)
1446
raise index.BadIndexOptions(self)
1467
1448
self._key_count = int(options_line[len(_OPTION_LEN):])
1468
1449
except ValueError:
1469
raise errors.BadIndexOptions(self)
1450
raise index.BadIndexOptions(self)
1470
1451
options_line = lines[3]
1471
1452
if not options_line.startswith(_OPTION_ROW_LENGTHS):
1472
raise errors.BadIndexOptions(self)
1453
raise index.BadIndexOptions(self)
1474
self._row_lengths = map(int, [length for length in
1475
options_line[len(_OPTION_ROW_LENGTHS):].split(',')
1455
self._row_lengths = [int(length) for length in
1456
options_line[len(_OPTION_ROW_LENGTHS):].split(
1477
1459
except ValueError:
1478
raise errors.BadIndexOptions(self)
1460
raise index.BadIndexOptions(self)
1479
1461
self._compute_row_offsets()
1481
1463
# calculate the bytes we have processed
1514
1496
self._size = num_bytes - base_offset
1515
1497
# the whole thing should be parsed out of 'bytes'
1516
1498
ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
1517
for start in xrange(base_offset, num_bytes, _PAGE_SIZE)]
1500
base_offset, num_bytes, _PAGE_SIZE)]
1520
1503
if offset > self._size:
1528
1511
elif bytes is not None:
1529
1512
# already have the whole file
1530
data_ranges = [(start, bytes[start:start+size])
1513
data_ranges = [(start, bytes[start:start + size])
1531
1514
for start, size in ranges]
1532
1515
elif self._file is None:
1533
1516
data_ranges = self._transport.readv(self._name, ranges)
1546
1529
bytes = zlib.decompress(data)
1547
1530
if bytes.startswith(_LEAF_FLAG):
1548
node = _LeafNode(bytes, self._key_length, self.node_ref_lists)
1531
node = self._leaf_factory(bytes, self._key_length,
1532
self.node_ref_lists)
1549
1533
elif bytes.startswith(_INTERNAL_FLAG):
1550
1534
node = _InternalNode(bytes)
1552
1536
raise AssertionError("Unknown node type for %r" % bytes)
1553
yield offset / _PAGE_SIZE, node
1537
yield offset // _PAGE_SIZE, node
1555
1539
def _signature(self):
1556
1540
"""The file signature for this index type."""
1566
1550
# We shouldn't be reading anything anyway
1568
1552
node_end = self._row_offsets[-1]
1569
for node in self._read_nodes(range(start_node, node_end)):
1553
for node in self._read_nodes(list(range(start_node, node_end))):
1557
_gcchk_factory = _LeafNode
1574
from bzrlib import _btree_serializer_pyx as _btree_serializer
1575
except ImportError, e:
1560
from . import _btree_serializer_pyx as _btree_serializer
1561
_gcchk_factory = _btree_serializer._parse_into_chk
1562
except ImportError as e:
1576
1563
osutils.failed_to_load_extension(e)
1577
from bzrlib import _btree_serializer_py as _btree_serializer
1564
from . import _btree_serializer_py as _btree_serializer