    def __init__(self):
        """Create a _BuilderRow."""
        self.spool = None  # tempfile.TemporaryFile(prefix='bzr-index-row-')

    def finish_node(self, pad=True):
        of nodes that BTreeBuilder will hold in memory.
        """
        index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
                                         key_elements=key_elements)
        self._spill_at = spill_at
        self._backing_indices = []
        # A map of {key: (node_refs, value)}
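        # _spill_at caps how many nodes stay in memory (see the docstring
        # above); once exceeded, nodes appear to be flushed into the
        # _backing_indices list, which later queries consult as well.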
            # Decorate each candidate with the value so that plain min() can
            # be used (Python 2.4's min() has no key= argument).
            candidates = [(item[1][1], item) for item
                          in enumerate(current_values) if item[1] is not None]
            if not len(candidates):
                return
            selected = min(candidates)
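            # min() compares the decorated tuples: the smallest value wins,
            # and ties fall back to comparing the (position, node) pair, so
            # the earliest iterator is preferred on equal values.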
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES  # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(
                        length, 0, optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET
                                              + b"%d\n" % rows[pos + 1].nodes)
            # reserve a new leaf:
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES  # padded
            rows[-1].writer = chunk_writer.ChunkWriter(
                length, optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
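        # ChunkWriter.write() returns True when the line did not fit in the
        # current page, which is why the result is checked below rather than
        # assumed to succeed.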
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
                optimize_for_size=self._optimize_for_size)
            new_row.writer.write(_INTERNAL_FLAG)
            new_row.writer.write(_INTERNAL_OFFSET
                                 + b"%d\n" % (rows[1].nodes - 1))
            new_row.writer.write(key_line)
            self._add_key(string_key, line, rows,
                          allow_optimize=allow_optimize)
    def _write_nodes(self, node_iterator, allow_optimize=True):
        """Write node_iterator out as a B+Tree.
                # First key triggers the first row
                rows.append(_LeafBuilderRow())
            string_key, line = _btree_serializer._flatten_node(
                node, self.reference_lists)
            self._add_key(string_key, line, rows,
                          allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
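        # Internal rows are padded so every page is exactly _PAGE_SIZE and
        # child offsets stay page-aligned; the leaf row comes last in the
        # file, so its final page apparently does not need padding.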
        result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
        if position > _RESERVED_HEADER_BYTES:
            raise AssertionError("Could not fit the header in the"
                                 " reserved space: %d > %d"
                                 % (position, _RESERVED_HEADER_BYTES))
        # write the rows out:
        for row in rows:
            reserved = _RESERVED_HEADER_BYTES  # reserved space for first node
            row.spool.flush()
            row.spool.seek(0)
            # copy nodes to the finalised file.
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0  # Only the root row actually has an offset
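            # 'position' is the number of header bytes sharing the first
            # page's reserved region; per the comment above, only the root
            # row carries that offset, so it resets to 0 for later rows and
            # they start on clean page boundaries.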
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
        size = result.tell()
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(
                3, "iter_all_entries scales with size of history.")
        # Doing serial rather than ordered would be faster; but this shouldn't
        # be getting called routinely anyway.
        iterators = [self._iter_mem_nodes()]
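        # The in-memory nodes form the first iterator; one more per backing
        # (spilled) index is presumably appended so results span everything
        # added so far.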
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        # Find things that are in backing indices that have not been handled
        if not self._backing_indices:
            return  # We won't find anything there either
        # Remove all of the keys that we found locally
        keys.difference_update(local_keys)
        for backing in self._backing_indices:
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in self._nodes.items():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
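            # nodes_by_key is a trie of nested dicts: each element of a
            # composite key selects one level, and the final element maps to
            # the full entry, e.g. nodes_by_key['a']['b'] for key ('a', 'b').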
            else:
                for key, (references, value) in self._nodes.items():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._nodes) + sum(
            backing.key_count()
            for backing in self._backing_indices
            if backing is not None)
    def validate(self):
        """In memory indexes have no known corruption at the moment."""

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return self._nodes < other._nodes
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeGraphIndex):
            return False
class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters, so don't use it.
        key_list = _btree_serializer._parse_leaf_lines(
            bytes, key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
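        # A _LeafNode is used as a dict of the entries on one page; min_key
        # and max_key record the smallest and largest keys stored there, so
        # callers can tell whether a key could live on this page at all.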
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None  # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
            self._leaf_node_cache = {}
            self._internal_node_cache = {}
    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other))
            and self._transport == other._transport
            and self._name == other._name
            and self._size == other._size)

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return ((self._name, self._size) < (other._name, other._size))
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeBuilder):
            return True

    def __ne__(self, other):
        return not self.__eq__(other)
        start_of_leaves = None
        for node_pos, node in self._read_nodes(sorted(nodes)):
            if node_pos == 0:  # Special case
                self._root_node = node
            else:
                if start_of_leaves is None:
                # Read whatever is left
                if cached_offsets:
                    expanded = [x for x in range(total_pages)
                                if x not in cached_offsets]
                else:
                    expanded = list(range(total_pages))
                if 'index' in debug.debug_flags:
                if first is None:
                    first, end = self._find_layer_first_and_end(pos)
                previous = pos - 1
                if (previous > 0
                        and previous not in cached_offsets
                        and previous not in final_offsets
                        and previous >= first):
                    next_tips.add(previous)
                after = pos + 1
                if (after < total_pages
                        and after not in cached_offsets
                        and after not in final_offsets
                        and after < end):
                    next_tips.add(after)
                # This would keep us from going bigger than
                # recommended_pages by only expanding the first offsets.
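                # Each pass widens the candidate set by one page on each side
                # of the current tips, staying within the current tree layer
                # (first/end) and skipping pages already cached or selected.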
        self._get_root_node()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))
        for node in self.iter_all_entries():
    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in nodes.values():
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or
            (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(
                3, "iter_all_entries scales with size of history.")
        if not self.key_count():
            return
        if self._row_offsets[-1] == 1:
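            # A final row offset of 1 means the index is a single page: the
            # root node is itself the only leaf, so it can be read without
            # walking any internal nodes.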
        # function, so there is even more to be gained.
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1:  # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception):
            pass

        class FixedDone(Exception):
            pass
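        # InputDone and FixedDone stand in for StopIteration so the merge
        # loop below can signal which of the two key streams ran out.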
            positions = self._multi_bisect_right(sub_keys, node.keys)
            node_offset = next_row_start + node.offset
            next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                        for pos, s_keys in positions])
        keys_at_index = next_nodes_and_keys
        # We should now be at the _LeafNodes
        node_indexes = [idx for idx, s_keys in keys_at_index]
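        # keys_at_index now pairs each leaf page index with the keys expected
        # on that page, so only those leaf pages need to be read.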
        if ref_list_num >= self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))
        # The main trick we are trying to accomplish is that when we find a
        # key listing its parents, we expect that the parent key is also likely
                    parents_not_on_page.add(key)
                else:
                    # assert (key != node.min_key and
                    #         key != node.max_key)
                    # If it was going to be present, it would be on
                    # *this* page, so mark it missing.
                    missing_keys.add(key)
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(b',')
                                 if length]
        except ValueError:
            raise index.BadIndexOptions(self)
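        # _row_lengths appears to hold the page count of each B+Tree level,
        # root row first; _compute_row_offsets presumably turns those counts
        # into cumulative page offsets used when seeking to a row.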
        self._compute_row_offsets()
            self._size = num_bytes - base_offset
            # the whole thing should be parsed out of 'bytes'
            ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                      for start in range(base_offset, num_bytes, _PAGE_SIZE)]
            if offset > self._size:

        elif bytes is not None:
            # already have the whole file
            data_ranges = [(start, bytes[start:start + size])
                           for start, size in ranges]
        elif self._file is None:
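            # transport.readv() yields (offset, data) pairs for the requested
            # byte ranges, letting pages be fetched without reading the whole
            # index file.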
            data_ranges = self._transport.readv(self._name, ranges)