# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

"""B+Tree indices"""
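
# A rough end-to-end sketch of how these classes fit together (illustrative
# names and values only, not tested documentation):
#
#   builder = BTreeBuilder(reference_lists=1, key_elements=1)
#   builder.add_node((b'rev-2',), b'data for rev-2', ([(b'rev-1',)],))
#   stream = builder.finish()           # file-like object with the new index
#   size = transport.put_file('x.btree', stream)
#   index = BTreeGraphIndex(transport, 'x.btree', size)
#   for entry in index.iter_all_entries():
#       ...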
from __future__ import absolute_import, division

from ..lazy_import import lazy_import
lazy_import(globals(), """
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
_BTSIGNATURE = b"B+Tree Graph Index 2\n"
54
_OPTION_ROW_LENGTHS = b"row_lengths="
55
_LEAF_FLAG = b"type=leaf\n"
56
_INTERNAL_FLAG = b"type=internal\n"
57
_INTERNAL_OFFSET = b"offset="
37
from bzrlib.index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
38
from bzrlib.transport import get_transport
41
_BTSIGNATURE = "B+Tree Graph Index 2\n"
42
_OPTION_ROW_LENGTHS = "row_lengths="
43
_LEAF_FLAG = "type=leaf\n"
44
_INTERNAL_FLAG = "type=internal\n"
45
_INTERNAL_OFFSET = "offset="
59
47
_RESERVED_HEADER_BYTES = 120
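
# The index file is a sequence of fixed-size pages (_PAGE_SIZE bytes). The
# first page of the root row begins with _RESERVED_HEADER_BYTES of header
# (the signature plus the option lines above); builders pad that region with
# \x00 until the real header is written over it when the index is finished.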
    def __init__(self):
        """Create a _BuilderRow."""
        self.spool = None  # tempfile.TemporaryFile(prefix='bzr-index-row-')

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
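            # Rows escalate through three states: no spool at all until the
            # first node is finished, an in-memory BytesIO while the row
            # still fits in one node, then this anonymous temporary file
            # once a second node appears.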
        of nodes that BTreeBuilder will hold in memory.
        """
        index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
                                         key_elements=key_elements)
        self._spill_at = spill_at
        self._backing_indices = []
        # A map of {key: (node_refs, value)}
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
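
    # Shape of the arguments, for a builder created with key_elements=2 and
    # reference_lists=1 (illustrative values only):
    #
    #   builder.add_node((b'file-id', b'rev-id'), b'value',
    #                    ([(b'file-id', b'parent-rev')],))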
        new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
        current_values = []
        for iterator in iterators_to_combine:
            try:
                current_values.append(next(iterator))
            except StopIteration:
                current_values.append(None)
        # Decorate candidates with the value to allow Python 2.4's min() to
        # be used.
        candidates = [(item[1][1], item) for item
                      in enumerate(current_values) if item[1] is not None]
        if not len(candidates):
            return
        selected = min(candidates)
        # undecorate back to (pos, node)
        selected = selected[1]
        if last == selected[1][1]:
            raise index.BadIndexDuplicateKey(last, self)
        last = selected[1][1]
        # Yield, with self as the index
        yield (self,) + selected[1][1:]
        pos = selected[0]
        try:
            current_values[pos] = next(iterators_to_combine[pos])
        except StopIteration:
            current_values[pos] = None
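        # Taken together this is a k-way merge: one cursor per source
        # iterator, min() picks the smallest current key, and only the
        # iterator that supplied the winner is advanced. Exhausted iterators
        # leave a None sentinel so they drop out of the candidate list.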
        flag when writing out. This is used by the _spill_mem_keys_to_disk
        functionality.
        """
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES  # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(
                        length, 0, optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET
                                              + b"%d\n" % rows[pos + 1].nodes)
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES  # padded
            rows[-1].writer = chunk_writer.ChunkWriter(
                length, optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET
                                     + b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows,
                          allow_optimize=allow_optimize)
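            # A full leaf is sealed with finish_node() and the recursive
            # _add_key call above retries the same key against a fresh leaf;
            # the key_line written into the rows above records the division
            # point that readers later binary-search on.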
    def _write_nodes(self, node_iterator, allow_optimize=True):
        """Write node_iterator out as a B+Tree.
                # First key triggers the first row
                rows.append(_LeafBuilderRow())
            string_key, line = _btree_serializer._flatten_node(
                node, self.reference_lists)
            self._add_key(string_key, line, rows,
                          allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
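        # For illustration, a small three-row index with one reference list
        # and single-element keys would serialise a header along these lines
        # (values made up):
        #
        #   B+Tree Graph Index 2\n
        #   node_ref_lists=1\n
        #   key_elements=1\n
        #   len=100\n
        #   row_lengths=1,4,95\n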
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
        if position > _RESERVED_HEADER_BYTES:
            raise AssertionError("Could not fit the header in the"
                                 " reserved space: %d > %d"
                                 % (position, _RESERVED_HEADER_BYTES))
        # write the rows out:
        for row in rows:
            reserved = _RESERVED_HEADER_BYTES  # reserved space for first node
            row.spool.flush()
            row.spool.seek(0)
            # copy nodes to the finalised file.
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0  # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
        size = result.tell()
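        # By this point the per-row spools have been concatenated into a
        # single stream: the header overlays the reserved \x00 prefix of the
        # first (root) page, and rows follow in root-to-leaf order.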
        efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(
                3, "iter_all_entries scales with size of history.")
        # Doing serial rather than ordered would be faster; but this shouldn't
        # be getting called routinely anyway.
        iterators = [self._iter_mem_nodes()]
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        # Find things that are in backing indices that have not been handled
        # yet.
        if not self._backing_indices:
            return  # We won't find anything there either
        # Remove all of the keys that we found locally
        keys.difference_update(local_keys)
        for backing in self._backing_indices:
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                node = self._nodes[key]
                yield self, key, node[1]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
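                    # nodes_by_key is a nested-dict trie: one dict level per
                    # key element, with the final element mapping to the
                    # (key, value[, references]) tuple used by prefix lookups.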
        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._nodes) + sum(
            backing.key_count()
            for backing in self._backing_indices
            if backing is not None)

    def validate(self):
        """In memory indexes have no known corruption at the moment."""
    def __lt__(self, other):
        if isinstance(other, type(self)):
            return self._nodes < other._nodes
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeGraphIndex):
            return False
class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        key_list = _btree_serializer._parse_leaf_lines(
            bytes, key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
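
# Design note: _LeafNode subclasses dict, so per-page key lookups are plain
# dict lookups; all_items() and all_keys() provide the sorted views that
# callers iterate over.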

class _InternalNode(object):

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])
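        # lines[0] is the b"type=internal" flag, lines[1] the b"offset=<n>"
        # option; [7:] strips the len(b"offset=") prefix.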
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None  # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
            self._leaf_node_cache = {}
            self._internal_node_cache = {}
        else:
            self._internal_node_cache = fifo_cache.FIFOCache(100)
        self._key_count = None
        self._row_lengths = None
        self._row_offsets = None  # Start of each row, [-1] is the end
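        # With unlimited_cache the node caches are plain dicts that grow
        # without bound; otherwise bounded caches (e.g. the FIFOCache(100)
        # for internal nodes above) evict old pages, trading re-reads for
        # memory.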
    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other))
            and self._transport == other._transport
            and self._name == other._name
            and self._size == other._size)

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return ((self._name, self._size) < (other._name, other._size))
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeBuilder):
            return True

    def __ne__(self, other):
        return not self.__eq__(other)
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
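        # With "from __future__ import division" in effect, "/" is true
        # division, so ceil() still rounds a partial page up, as the old
        # float() cast used to ensure.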
        return recommended_pages
    def _compute_total_pages_in_index(self):
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages
    def _expand_offsets(self, offsets):
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter(' reading all unread pages: %s', expanded)
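            # Heuristic: once the unread remainder fits within the
            # transport's recommended read size, it is cheaper to fetch every
            # remaining page in one request than to keep expanding around
            # individual offsets.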
                if first is None:
                    first, end = self._find_layer_first_and_end(pos)
                previous = pos - 1
                if (previous > 0 and
                        previous not in cached_offsets and
                        previous not in final_offsets and
                        previous >= first):
                    next_tips.add(previous)
                after = pos + 1
                if (after < total_pages and
                        after not in cached_offsets and
                        after not in final_offsets and
                        after < end):
                    next_tips.add(after)
                # This would keep us from going bigger than
                # recommended_pages by only expanding the first offsets.
        self._get_root_node()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))
        for node in self.iter_all_entries():
    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or
            (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(
                3, "iter_all_entries scales with size of history.")
        if not self.key_count():
            return
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read everything
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)
        # function, so there is even more to be gained.
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1:  # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        # for key in in_keys:
        # return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception):
            pass

        class FixedDone(Exception):
            pass
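        # The walk below advances the two sorted sequences in lockstep for
        # O(M + N) comparisons, versus O(M log N) for repeated bisection;
        # InputDone/FixedDone are sentinels used to break out when either
        # side is exhausted.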
            positions = self._multi_bisect_right(sub_keys, node.keys)
            node_offset = next_row_start + node.offset
            next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                        for pos, s_keys in positions])
        keys_at_index = next_nodes_and_keys
        # We should now be at the _LeafNodes
        node_indexes = [idx for idx, s_keys in keys_at_index]
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
        if ref_list_num >= self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))
        # The main trick we are trying to accomplish is that when we find a
        # key listing its parents, we expect that the parent key is also likely
            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
                    else:
                        parents_not_on_page.add(key)
                        # assert (key != node.min_key and
                        #         key != node.max_key)
                        # If it was going to be present, it would be on
                        # *this* page, so mark it missing.
                        missing_keys.add(key)
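            # Locality is the win here: parents tend to live on the same page
            # as their children, so resolving whatever parents are already on
            # this page avoids another round trip for most keys.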
        self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        if self.node_ref_lists:
            if self._key_length == 1:
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
    def key_count(self):
        """Return an estimate of the number of keys in this index.
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(
                                     b',')
                                 if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()
        # calculate the bytes we have processed
            self._size = num_bytes - base_offset
            # the whole thing should be parsed out of 'bytes'
            ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                      for start in range(
                          base_offset, num_bytes, _PAGE_SIZE)]
        if offset > self._size:
        elif bytes is not None:
            # already have the whole file
            data_ranges = [(start, bytes[start:start + size])
                           for start, size in ranges]
        elif self._file is None:
            data_ranges = self._transport.readv(self._name, ranges)
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node

    def _signature(self):
        """The file signature for this index type."""
        # We shouldn't be reading anything anyway
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer