# Copyright (C) 2008-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

from __future__ import absolute_import, division

from ..lazy_import import lazy_import
lazy_import(globals(), """
import bisect
import math
import tempfile
import zlib

from breezy import (
    chunk_writer,
    debug,
    fifo_cache,
    lru_cache,
    osutils,
    static_tuple,
    trace,
    transport,
    )
""")
from .. import index
from .index import _OPTION_NODE_REFS, _OPTION_KEY_ELEMENTS, _OPTION_LEN
from ..sixish import (
    BytesIO,
    map,
    range,
    viewitems,
    viewvalues,
    )


_BTSIGNATURE = b"B+Tree Graph Index 2\n"
_OPTION_ROW_LENGTHS = b"row_lengths="
_LEAF_FLAG = b"type=leaf\n"
_INTERNAL_FLAG = b"type=internal\n"
_INTERNAL_OFFSET = b"offset="

_RESERVED_HEADER_BYTES = 120
_PAGE_SIZE = 4096

# 4K per page: 4MB - 1000 entries
_NODE_CACHE_SIZE = 1000
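# With 4096-byte pages, a full node cache holds roughly
# _NODE_CACHE_SIZE * _PAGE_SIZE = 1000 * 4096 bytes, i.e. about 4MB,
# which is the sizing the comment above refers to.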


class _BuilderRow(object):
    """The stored state accumulated while writing out a row in the index.

    :ivar spool: A temporary file used to accumulate nodes for this row
    :ivar nodes: The count of nodes emitted so far.
    """

    def __init__(self):
        """Create a _BuilderRow."""
        self.nodes = 0
        self.spool = None  # tempfile.TemporaryFile(prefix='bzr-index-row-')
        self.writer = None

    def finish_node(self, pad=True):
        byte_lines, _, padding = self.writer.finish()
        if self.nodes == 0:
            self.spool = BytesIO()
            self.spool.write(b"\x00" * _RESERVED_HEADER_BYTES)
        elif self.nodes == 1:
            # We got bigger than 1 node, switch to a temp file
            spool = tempfile.TemporaryFile(prefix='bzr-index-row-')
            spool.write(self.spool.getvalue())
            self.spool = spool
        skipped_bytes = 0
        if not pad and padding:
            del byte_lines[-1]
            skipped_bytes = padding
        self.spool.writelines(byte_lines)
        remainder = (self.spool.tell() + skipped_bytes) % _PAGE_SIZE
        if remainder != 0:
            raise AssertionError("incorrect node length: %d, %d"
                                 % (self.spool.tell(), remainder))
        self.nodes += 1
        self.writer = None


class _InternalBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out internal rows."""

    def finish_node(self, pad=True):
        if not pad:
            raise AssertionError("Must pad internal nodes only.")
        _BuilderRow.finish_node(self)


class _LeafBuilderRow(_BuilderRow):
    """The stored state accumulated while writing out leaf rows."""


class BTreeBuilder(index.GraphIndexBuilder):
    """A Builder for B+Tree based Graph indices.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES
    _SIGNATURE      := 'B+Tree Graph Index 2' NEWLINE
    OPTIONS         := REF_LISTS KEY_ELEMENTS LENGTH
    REF_LISTS       := 'node_ref_lists=' DIGITS NEWLINE
    KEY_ELEMENTS    := 'key_elements=' DIGITS NEWLINE
    LENGTH          := 'len=' DIGITS NEWLINE
    ROW_LENGTHS     := 'row_lengths=' DIGITS (COMMA DIGITS)*
    NODES           := NODE_COMPRESSED*
    NODE_COMPRESSED := COMPRESSED_BYTES{4096}
    NODE_RAW        := INTERNAL | LEAF
    INTERNAL        := INTERNAL_FLAG POINTERS
    LEAF            := LEAF_FLAG ROWS
    KEY_ELEMENT     := Not-whitespace-utf8
    KEY             := KEY_ELEMENT (NULL KEY_ELEMENT)*
    ROWS            := ROW*
    ROW             := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    ABSENT          := 'a'
    REFERENCES      := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST  := (REFERENCE (CR REFERENCE)*)?
    REFERENCE       := KEY
    VALUE           := no-newline-no-null-bytes
    """
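
    # Illustrative example of the ROW production (an assumption based on the
    # grammar above, with one reference list): the key ('foo',) with value
    # 'bar' and a single reference to ('baz',) would flatten to
    #   b'foo\x00\x00baz\x00bar\n'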

    def __init__(self, reference_lists=0, key_elements=1, spill_at=100000):
        """See GraphIndexBuilder.__init__.

        :param spill_at: Optional parameter controlling the maximum number
            of nodes that BTreeBuilder will hold in memory.
        """
        index.GraphIndexBuilder.__init__(self, reference_lists=reference_lists,
                                         key_elements=key_elements)
        self._spill_at = spill_at
        self._backing_indices = []
        # A map of {key: (node_refs, value)}
        self._nodes = {}
        # Indicate it hasn't been built yet
        self._nodes_by_key = None
        self._optimize_for_size = False

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        If adding the node causes the builder to reach its spill_at threshold,
        disk spilling will be triggered.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        # Ensure that 'key' is a StaticTuple
        key = static_tuple.StaticTuple.from_sequence(key).intern()
        # we don't care about absent_references
        node_refs, _ = self._check_key_ref_value(key, references, value)
        if key in self._nodes:
            raise index.BadIndexDuplicateKey(key, self)
        self._nodes[key] = static_tuple.StaticTuple(node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
        if len(self._nodes) < self._spill_at:
            return
        self._spill_mem_keys_to_disk()
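
    # Example usage (an illustrative sketch, not from the original source):
    #   builder = BTreeBuilder(reference_lists=1, key_elements=1)
    #   builder.add_node((b'rev-2',), b'metadata',
    #                    references=([(b'rev-1',)],))
    # Once len(self._nodes) reaches spill_at, the in-memory nodes are
    # spilled to a backing index on disk.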

    def _spill_mem_keys_to_disk(self):
        """Write the in-memory keys down to disk to cap memory consumption.

        If we already have some keys written to disk, we will combine them so
        as to preserve the sorted order. The algorithm for combining uses
        powers of two. So on the first spill, write all mem nodes into a
        single index. On the second spill, combine the mem nodes with the
        nodes on disk to create a 2x sized disk index and get rid of the
        first index. On the third spill, create a single new disk index,
        which will contain the mem nodes, and preserve the existing 2x sized
        index. On the fourth, combine mem with the first and second indexes,
        creating a new one of size 4x. On the fifth create a single new one,
        etc.
        """
        if self._combine_backing_indices:
            (new_backing_file, size,
             backing_pos) = self._spill_mem_keys_and_combine()
        else:
            new_backing_file, size = self._spill_mem_keys_without_combining()
        # Note: The transport here isn't strictly needed, because we will use
        #       direct access to the new_backing._file object
        new_backing = BTreeGraphIndex(transport.get_transport_from_path('.'),
                                      '<temp>', size)
        # GC will clean up the file
        new_backing._file = new_backing_file
        if self._combine_backing_indices:
            if len(self._backing_indices) == backing_pos:
                self._backing_indices.append(None)
            self._backing_indices[backing_pos] = new_backing
            for backing_pos in range(backing_pos):
                self._backing_indices[backing_pos] = None
        else:
            self._backing_indices.append(new_backing)
        self._nodes = {}
        self._nodes_by_key = None
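
    # Worked example of the combining schedule (illustrative): with the
    # default spill_at of 100000 nodes, _backing_indices evolves as
    #   spill 1: [1x]
    #   spill 2: [None, 2x]       (mem combined with the 1x index)
    #   spill 3: [1x, 2x]         (mem written alone)
    #   spill 4: [None, None, 4x] (mem combined with 1x and 2x)
    # so each slot holds a power-of-two multiple of spill_at keys and only
    # O(log n) backing indices exist at once.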

    def _spill_mem_keys_without_combining(self):
        return self._write_nodes(self._iter_mem_nodes(), allow_optimize=False)

    def _spill_mem_keys_and_combine(self):
        iterators_to_combine = [self._iter_mem_nodes()]
        pos = -1
        for pos, backing in enumerate(self._backing_indices):
            if backing is None:
                pos -= 1
                break
            iterators_to_combine.append(backing.iter_all_entries())
        backing_pos = pos + 1
        new_backing_file, size = \
            self._write_nodes(self._iter_smallest(iterators_to_combine),
                              allow_optimize=False)
        return new_backing_file, size, backing_pos

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def _iter_mem_nodes(self):
        """Iterate over the nodes held in memory."""
        nodes = self._nodes
        if self.reference_lists:
            for key in sorted(nodes):
                references, value = nodes[key]
                yield self, key, value, references
        else:
            for key in sorted(nodes):
                references, value = nodes[key]
                yield self, key, value

    def _iter_smallest(self, iterators_to_combine):
        if len(iterators_to_combine) == 1:
            for value in iterators_to_combine[0]:
                yield value
            return
        current_values = []
        for iterator in iterators_to_combine:
            try:
                current_values.append(next(iterator))
            except StopIteration:
                current_values.append(None)
        last = None
        while True:
            # Decorate candidates with the value to allow 2.4's min to be used.
            candidates = [(item[1][1], item) for item
                          in enumerate(current_values) if item[1] is not None]
            if not len(candidates):
                return
            selected = min(candidates)
            # undecorate back to (pos, node)
            selected = selected[1]
            if last == selected[1][1]:
                raise index.BadIndexDuplicateKey(last, self)
            last = selected[1][1]
            # Yield, with self as the index
            yield (self,) + selected[1][1:]
            pos = selected[0]
            try:
                current_values[pos] = next(iterators_to_combine[pos])
            except StopIteration:
                current_values[pos] = None

    def _add_key(self, string_key, line, rows, allow_optimize=True):
        """Add a key to the current chunk.

        :param string_key: The key to add.
        :param line: The fully serialised key and value.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        """
        new_leaf = False
        if rows[-1].writer is None:
            # opening a new leaf chunk;
            new_leaf = True
            for pos, internal_row in enumerate(rows[:-1]):
                # flesh out any internal nodes that are needed to
                # preserve the height of the tree
                if internal_row.writer is None:
                    length = _PAGE_SIZE
                    if internal_row.nodes == 0:
                        length -= _RESERVED_HEADER_BYTES  # padded
                    if allow_optimize:
                        optimize_for_size = self._optimize_for_size
                    else:
                        optimize_for_size = False
                    internal_row.writer = chunk_writer.ChunkWriter(
                        length, 0, optimize_for_size=optimize_for_size)
                    internal_row.writer.write(_INTERNAL_FLAG)
                    internal_row.writer.write(_INTERNAL_OFFSET
                                              + b"%d\n" % rows[pos + 1].nodes)
            # add a new leaf
            length = _PAGE_SIZE
            if rows[-1].nodes == 0:
                length -= _RESERVED_HEADER_BYTES  # padded
            rows[-1].writer = chunk_writer.ChunkWriter(
                length, optimize_for_size=self._optimize_for_size)
            rows[-1].writer.write(_LEAF_FLAG)
        if rows[-1].writer.write(line):
            # if we failed to write, despite having an empty page to write to,
            # then line is too big. raising the error avoids infinite recursion
            # searching for a suitably large page that will not be found.
            if new_leaf:
                raise index.BadIndexKey(string_key)
            # this key did not fit in the node:
            rows[-1].finish_node()
            key_line = string_key + b"\n"
            new_row = True
            for row in reversed(rows[:-1]):
                # Mark the start of the next node in the node above. If it
                # doesn't fit then propagate upwards until we find one that
                # it does fit into.
                if row.writer.write(key_line):
                    row.finish_node()
                else:
                    # We've found a node that can handle the pointer.
                    new_row = False
                    break
            # If we reached the current root without being able to mark the
            # division point, then we need a new root:
            if new_row:
                if 'index' in debug.debug_flags:
                    trace.mutter('Inserting new global row.')
                new_row = _InternalBuilderRow()
                reserved_bytes = 0
                rows.insert(0, new_row)
                # This will be padded, hence the -_RESERVED_HEADER_BYTES
                new_row.writer = chunk_writer.ChunkWriter(
                    _PAGE_SIZE - _RESERVED_HEADER_BYTES,
                    reserved_bytes,
                    optimize_for_size=self._optimize_for_size)
                new_row.writer.write(_INTERNAL_FLAG)
                new_row.writer.write(_INTERNAL_OFFSET
                                     + b"%d\n" % (rows[1].nodes - 1))
                new_row.writer.write(key_line)
            self._add_key(string_key, line, rows,
                          allow_optimize=allow_optimize)

    def _write_nodes(self, node_iterator, allow_optimize=True):
        """Write node_iterator out as a B+Tree.

        :param node_iterator: An iterator of sorted nodes. Each node should
            match the output given by iter_all_entries.
        :param allow_optimize: If set to False, prevent setting the optimize
            flag when writing out. This is used by the _spill_mem_keys_to_disk
            functionality.
        :return: A file handle for a temporary file containing a B+Tree for
            the nodes.
        """
        # The index rows - rows[0] is the root, rows[1] is the layer under it
        # etc.
        rows = []
        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup
        key_count = 0
        # A stack with the number of nodes of each size. 0 is the root node
        # and must always be 1 (if there are any nodes in the tree).
        self.row_lengths = []
        # Loop over all nodes adding them to the bottom row
        # (rows[-1]). When we finish a chunk in a row,
        # propagate the key that didn't fit (comes after the chunk) to the
        # row above, transitively.
        for node in node_iterator:
            if key_count == 0:
                # First key triggers the first row
                rows.append(_LeafBuilderRow())
            key_count += 1
            string_key, line = _btree_serializer._flatten_node(
                node, self.reference_lists)
            self._add_key(string_key, line, rows,
                          allow_optimize=allow_optimize)
        for row in reversed(rows):
            pad = (not isinstance(row, _LeafBuilderRow))
            row.finish_node(pad=pad)
        lines = [_BTSIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        row_lengths = [row.nodes for row in rows]
        lines.append(_OPTION_ROW_LENGTHS + ','.join(
            map(str, row_lengths)).encode('ascii') + b'\n')
        if row_lengths and row_lengths[-1] > 1:
            result = tempfile.NamedTemporaryFile(prefix='bzr-index-')
        else:
            result = BytesIO()
        result.writelines(lines)
        position = sum(map(len, lines))
        if position > _RESERVED_HEADER_BYTES:
            raise AssertionError("Could not fit the header in the"
                                 " reserved space: %d > %d"
                                 % (position, _RESERVED_HEADER_BYTES))
        # write the rows out:
        for row in rows:
            reserved = _RESERVED_HEADER_BYTES  # reserved space for first node
            row.spool.flush()
            row.spool.seek(0)
            # copy nodes to the finalised file.
            # Special case the first node as it may be prefixed
            node = row.spool.read(_PAGE_SIZE)
            result.write(node[reserved:])
            if len(node) == _PAGE_SIZE:
                result.write(b"\x00" * (reserved - position))
            position = 0  # Only the root row actually has an offset
            copied_len = osutils.pumpfile(row.spool, result)
            if copied_len != (row.nodes - 1) * _PAGE_SIZE:
                if not isinstance(row, _LeafBuilderRow):
                    raise AssertionError("Incorrect amount of data copied"
                                         " expected: %d, got: %d"
                                         % ((row.nodes - 1) * _PAGE_SIZE,
                                            copied_len))
        result.flush()
        size = result.tell()
        result.seek(0)
        return result, size

    def finish(self):
        """Finalise the index.

        :return: A file handle for a temporary file containing the nodes added
            to the index.
        """
        return self._write_nodes(self.iter_all_entries())[0]

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(
                3, "iter_all_entries scales with size of history.")
        # Doing serial rather than ordered would be faster; but this shouldn't
        # be getting called routinely anyway.
        iterators = [self._iter_mem_nodes()]
        for backing in self._backing_indices:
            if backing is not None:
                iterators.append(backing.iter_all_entries())
        if len(iterators) == 1:
            return iter(iterators[0])
        return self._iter_smallest(iterators)

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        # Note: We don't use keys.intersection() here. If you read the C api,
        #       set.intersection(other) special cases when other is a set and
        #       will iterate the smaller of the two and lookup in the other.
        #       It does *not* do this for any other type (even dict, unlike
        #       some other set functions.) Since we expect keys is generally <<
        #       self._nodes, it is faster to iterate over it in a list
        #       instead.
        nodes = self._nodes
        local_keys = [key for key in keys if key in nodes]
        if self.reference_lists:
            for key in local_keys:
                node = nodes[key]
                yield self, key, node[1], node[0]
        else:
            for key in local_keys:
                node = nodes[key]
                yield self, key, node[1]
        # Find things that are in backing indices that have not been handled
        # yet.
        if not self._backing_indices:
            return  # We won't find anything there either
        # Remove all of the keys that we found locally
        keys.difference_update(local_keys)
        for backing in self._backing_indices:
            if backing is None:
                continue
            for node in backing.iter_entries(keys):
                keys.remove(node[1])
                yield (self,) + node[1:]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        for backing in self._backing_indices:
            if backing is None:
                continue
            for node in backing.iter_entries_prefix(keys):
                yield (self,) + node[1:]
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    node = self._nodes[key]
                except KeyError:
                    continue
                if self.reference_lists:
                    yield self, key, node[1], node[0]
                else:
                    yield self, key, node[1]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._nodes) + sum(
            backing.key_count()
            for backing in self._backing_indices
            if backing is not None)

    def validate(self):
        """In-memory indexes have no known corruption at the moment."""

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return self._nodes < other._nodes
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeGraphIndex):
            return False
        raise TypeError(other)


class _LeafNode(dict):
    """A leaf node for a serialised B+Tree index."""

    __slots__ = ('min_key', 'max_key', '_keys')

    def __init__(self, bytes, key_length, ref_list_length):
        """Parse bytes to create a leaf node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        key_list = _btree_serializer._parse_leaf_lines(
            bytes, key_length, ref_list_length)
        if key_list:
            self.min_key = key_list[0][0]
            self.max_key = key_list[-1][0]
        else:
            self.min_key = self.max_key = None
        super(_LeafNode, self).__init__(key_list)
        self._keys = dict(self)

    def all_items(self):
        """Return a sorted list of (key, (value, refs)) items"""
        items = sorted(self.items())
        return items

    def all_keys(self):
        """Return a sorted list of all keys."""
        keys = sorted(self.keys())
        return keys
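

# Illustrative sketch (assumed shapes): a parsed _LeafNode behaves like a
# dict mapping key tuples to (value, ref_lists) pairs, e.g.
#   leaf[(b'foo',)] == (b'bar', ((b'baz',),))
# for a single-reference-list index, while min_key/max_key bound the keys
# stored on this page.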


class _InternalNode(object):
    """An internal node for a serialised B+Tree index."""

    __slots__ = ('keys', 'offset')

    def __init__(self, bytes):
        """Parse bytes to create an internal node object."""
        # splitlines mangles the \r delimiters.. don't use it.
        self.keys = self._parse_lines(bytes.split(b'\n'))

    def _parse_lines(self, lines):
        nodes = []
        self.offset = int(lines[1][7:])
        as_st = static_tuple.StaticTuple.from_sequence
        for line in lines[2:]:
            if line == b'':
                break
            # GZ 2017-05-24: Used to intern() each chunk of line as well, need
            # to recheck performance and perhaps adapt StaticTuple to adjust.
            nodes.append(as_st(line.split(b'\0')).intern())
        return nodes


class BTreeGraphIndex(object):
    """Access to nodes via the standard GraphIndex interface for B+Trees.

    Individual nodes are held in an LRU cache. This holds the root node in
    memory except when very large walks are done.
    """

    def __init__(self, transport, name, size, unlimited_cache=False,
                 offset=0):
        """Create a B+Tree index object on the index name.

        :param transport: The transport to read data for the index from.
        :param name: The file name of the index on transport.
        :param size: Optional size of the index in bytes. This allows
            compatibility with the GraphIndex API, as well as ensuring that
            the initial read (to read the root node header) can be done
            without over-reading even on empty indices, and on small indices
            allows single-IO to read the entire index.
        :param unlimited_cache: If set to True, then instead of using an
            LRUCache with size _NODE_CACHE_SIZE, we will use a dict and always
            cache all leaf nodes.
        :param offset: The start of the btree index data isn't byte 0 of the
            file. Instead it starts at some point later.
        """
        self._transport = transport
        self._name = name
        self._size = size
        self._file = None
        self._recommended_pages = self._compute_recommended_pages()
        self._root_node = None
        self._base_offset = offset
        self._leaf_factory = _LeafNode
        # Default max size is 100,000 leaf values
        self._leaf_value_cache = None  # lru_cache.LRUCache(100*1000)
        if unlimited_cache:
            self._leaf_node_cache = {}
            self._internal_node_cache = {}
        else:
            self._leaf_node_cache = lru_cache.LRUCache(_NODE_CACHE_SIZE)
            # We use a FIFO here just to prevent possible blowout. However, a
            # 300k record btree has only 3k leaf nodes, and only 20 internal
            # nodes. A value of 100 scales to ~100*100*100 = 1M records.
            self._internal_node_cache = fifo_cache.FIFOCache(100)
        self._key_count = None
        self._row_lengths = None
        self._row_offsets = None  # Start of each row, [-1] is the end

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other))
            and self._transport == other._transport
            and self._name == other._name
            and self._size == other._size)

    def __lt__(self, other):
        if isinstance(other, type(self)):
            return ((self._name, self._size) < (other._name, other._size))
        # Always sort existing indexes before ones that are still being built.
        if isinstance(other, BTreeBuilder):
            return True
        raise TypeError(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def _get_and_cache_nodes(self, nodes):
        """Read nodes and cache them in the lru.

        The nodes list supplied is sorted and then read from disk, each node
        being inserted into the _node_cache.

        Note: Asking for more nodes than the _node_cache can contain will
        result in some of the results being immediately discarded, to prevent
        this an assertion is raised if more nodes are asked for than are
        cacheable.

        :return: A dict of {node_pos: node}
        """
        found = {}
        start_of_leaves = None
        for node_pos, node in self._read_nodes(sorted(nodes)):
            if node_pos == 0:  # Special case
                self._root_node = node
            else:
                if start_of_leaves is None:
                    start_of_leaves = self._row_offsets[-2]
                if node_pos < start_of_leaves:
                    self._internal_node_cache[node_pos] = node
                else:
                    self._leaf_node_cache[node_pos] = node
            found[node_pos] = node
        return found

    def _compute_recommended_pages(self):
        """Convert transport's recommended_page_size into btree pages.

        recommended_page_size is in bytes, we want to know how many _PAGE_SIZE
        pages fit in that length.
        """
        recommended_read = self._transport.recommended_page_size()
        recommended_pages = int(math.ceil(recommended_read / _PAGE_SIZE))
        return recommended_pages
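
    # For example (illustrative), a transport recommending 64kB reads gives
    # int(math.ceil(65536 / 4096)) = 16 pages per request.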

    def _compute_total_pages_in_index(self):
        """How many pages are in the index.

        If we have read the header we will use the value stored there.
        Otherwise it will be computed based on the length of the index.
        """
        if self._size is None:
            raise AssertionError('_compute_total_pages_in_index should not be'
                                 ' called when self._size is None')
        if self._root_node is not None:
            # This is the number of pages as defined by the header
            return self._row_offsets[-1]
        # This is the number of pages as defined by the size of the index. They
        # should be identical.
        total_pages = int(math.ceil(self._size / _PAGE_SIZE))
        return total_pages

    def _expand_offsets(self, offsets):
        """Find extra pages to download.

        The idea is that we always want to make big-enough requests (like 64kB
        for http), so that we don't waste round trips. So given the entries
        that we already have cached and the new pages being downloaded figure
        out what other pages we might want to read.

        See also doc/developers/btree_index_prefetch.txt for more details.

        :param offsets: The offsets to be read
        :return: A list of offsets to download
        """
        if 'index' in debug.debug_flags:
            trace.mutter('expanding: %s\toffsets: %s', self._name, offsets)

        if len(offsets) >= self._recommended_pages:
            # Don't add more, we are already requesting more than enough
            if 'index' in debug.debug_flags:
                trace.mutter('  not expanding large request (%s >= %s)',
                             len(offsets), self._recommended_pages)
            return offsets
        if self._size is None:
            # Don't try anything, because we don't know where the file ends
            if 'index' in debug.debug_flags:
                trace.mutter('  not expanding without knowing index size')
            return offsets
        total_pages = self._compute_total_pages_in_index()
        cached_offsets = self._get_offsets_to_cached_pages()
        # If reading recommended_pages would read the rest of the index, just
        # read it all
        if total_pages - len(cached_offsets) <= self._recommended_pages:
            # Read whatever is left
            if cached_offsets:
                expanded = [x for x in range(total_pages)
                            if x not in cached_offsets]
            else:
                expanded = list(range(total_pages))
            if 'index' in debug.debug_flags:
                trace.mutter('  reading all unread pages: %s', expanded)
            return expanded

        if self._root_node is None:
            # ATM on the first read of the root node of a large index, we don't
            # bother pre-reading any other pages. This is because the
            # likelihood of actually reading interesting pages is very low.
            # See doc/developers/btree_index_prefetch.txt for a discussion, and
            # a possible implementation when we are guessing that the second
            # layer index is small
            final_offsets = offsets
        else:
            tree_depth = len(self._row_lengths)
            if len(cached_offsets) < tree_depth and len(offsets) == 1:
                # We haven't read enough to justify expansion
                # If we are only going to read the root node, and 1 leaf node,
                # then it isn't worth expanding our request. Once we've read at
                # least 2 nodes, then we are probably doing a search, and we
                # start expanding our requests.
                if 'index' in debug.debug_flags:
                    trace.mutter('  not expanding on first reads')
                return offsets
            final_offsets = self._expand_to_neighbors(offsets, cached_offsets,
                                                      total_pages)

        final_offsets = sorted(final_offsets)
        if 'index' in debug.debug_flags:
            trace.mutter('expanded: %s', final_offsets)

        return final_offsets

    def _expand_to_neighbors(self, offsets, cached_offsets, total_pages):
        """Expand requests to neighbors until we have enough pages.

        This is called from _expand_offsets after policy has determined that we
        want to expand.

        We only want to expand requests within a given layer. We cheat a little
        bit and assume all requests will be in the same layer. This is true
        given the current design, but if it changes this algorithm may perform
        oddly.

        :param offsets: requested offsets
        :param cached_offsets: offsets for pages we currently have cached
        :return: A set() of offsets after expansion
        """
        final_offsets = set(offsets)
        first = end = None
        new_tips = set(final_offsets)
        while len(final_offsets) < self._recommended_pages and new_tips:
            next_tips = set()
            for pos in new_tips:
                if first is None:
                    first, end = self._find_layer_first_and_end(pos)
                previous = pos - 1
                if (previous > 0 and
                        previous not in cached_offsets and
                        previous not in final_offsets and
                        previous >= first):
                    next_tips.add(previous)
                after = pos + 1
                if (after < total_pages and
                        after not in cached_offsets and
                        after not in final_offsets and
                        after < end):
                    next_tips.add(after)
                # This would keep us from going bigger than
                # recommended_pages by only expanding the first offsets.
                # However, if we are making a 'wide' request, it is
                # reasonable to expand all points equally.
                # if len(final_offsets) > recommended_pages:
                #     break
            final_offsets.update(next_tips)
            new_tips = next_tips
        return final_offsets

    def clear_cache(self):
        """Clear out any cached/memoized values.

        This can be called at any time, but generally it is used when we have
        extracted some information, but don't expect to be requesting any more
        of the data.
        """
        # Note that we don't touch self._root_node or self._internal_node_cache
        # We don't expect either of those to be big, and it can save
        # round-trips in the future. We may re-evaluate this if InternalNode
        # memory starts to be an issue.
        self._leaf_node_cache.clear()

    def external_references(self, ref_list_num):
        if self._root_node is None:
            self._get_root_node()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))
        keys = set()
        refs = set()
        for node in self.iter_all_entries():
            keys.add(node[1])
            refs.update(node[3][ref_list_num])
        return refs - keys

    def _find_layer_first_and_end(self, offset):
        """Find the start/stop nodes for the layer corresponding to offset.

        :return: (first, end)
            first is the first node in this layer
            end is the first node of the next layer
        """
        first = end = 0
        for roffset in self._row_offsets:
            first = end
            end = roffset
            if offset < roffset:
                break
        return first, end
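
    # Worked example (illustrative): with _row_offsets = [0, 1, 5, 68]
    # (root, 4 internal nodes, 63 leaves), offset 3 sits in the layer
    # spanning nodes [1, 5), so this returns (1, 5).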

    def _get_offsets_to_cached_pages(self):
        """Determine what nodes we already have cached."""
        cached_offsets = set(self._internal_node_cache)
        # cache may be dict or LRUCache, keys() is the common method
        cached_offsets.update(self._leaf_node_cache.keys())
        if self._root_node is not None:
            cached_offsets.add(0)
        return cached_offsets

    def _get_root_node(self):
        if self._root_node is None:
            # We may not have a root node yet
            self._get_internal_nodes([0])
        return self._root_node

    def _get_nodes(self, cache, node_indexes):
        found = {}
        needed = []
        for idx in node_indexes:
            if idx == 0 and self._root_node is not None:
                found[0] = self._root_node
                continue
            try:
                found[idx] = cache[idx]
            except KeyError:
                needed.append(idx)
        if not needed:
            return found
        needed = self._expand_offsets(needed)
        found.update(self._get_and_cache_nodes(needed))
        return found

    def _get_internal_nodes(self, node_indexes):
        """Get a node, from cache or disk.

        After getting it, the node will be cached.
        """
        return self._get_nodes(self._internal_node_cache, node_indexes)

    def _cache_leaf_values(self, nodes):
        """Cache directly from key => value, skipping the btree."""
        if self._leaf_value_cache is not None:
            for node in viewvalues(nodes):
                for key, value in node.all_items():
                    if key in self._leaf_value_cache:
                        # Don't add the rest of the keys, we've seen this node
                        # before.
                        break
                    self._leaf_value_cache[key] = value

    def _get_leaf_nodes(self, node_indexes):
        """Get a bunch of nodes, from cache or disk."""
        found = self._get_nodes(self._leaf_node_cache, node_indexes)
        self._cache_leaf_values(found)
        return found

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or
            (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(
                3, "iter_all_entries scales with size of history.")
        if not self.key_count():
            return
        if self._row_offsets[-1] == 1:
            # There is only the root node, and we read that via key_count()
            if self.node_ref_lists:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value, refs)
            else:
                for key, (value, refs) in self._root_node.all_items():
                    yield (self, key, value)
            return
        start_of_leaves = self._row_offsets[-2]
        end_of_leaves = self._row_offsets[-1]
        needed_offsets = list(range(start_of_leaves, end_of_leaves))
        if needed_offsets == [0]:
            # Special case when we only have a root node, as we have already
            # read it in
            nodes = [(0, self._root_node)]
        else:
            nodes = self._read_nodes(needed_offsets)
        # We iterate strictly in-order so that we can use this function
        # for spilling index builds to disk.
        if self.node_ref_lists:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value, refs)
        else:
            for _, node in nodes:
                for key, (value, refs) in node.all_items():
                    yield (self, key, value)

    @staticmethod
    def _multi_bisect_right(in_keys, fixed_keys):
        """Find the positions where each 'in_key' would fit in fixed_keys.

        This is equivalent to doing "bisect_right" on each in_key into
        fixed_keys.

        :param in_keys: A sorted list of keys to match with fixed_keys
        :param fixed_keys: A sorted list of keys to match against
        :return: A list of (integer position, [key list]) tuples.
        """
        if not in_keys:
            raise AssertionError('in_keys should not be empty')
        if not fixed_keys:
            # no pointers in the fixed_keys list, which means everything must
            # fall to the left.
            return [(0, in_keys)]

        # TODO: Iterating both lists will generally take M + N steps
        #       Bisecting each key will generally take M * log2 N steps.
        #       If we had an efficient way to compare, we could pick the method
        #       based on which has the fewer number of steps.
        #       There is also the argument that bisect_right is a compiled
        #       function, so there is even more to be gained.
        # iter_steps = len(in_keys) + len(fixed_keys)
        # bisect_steps = len(in_keys) * math.log(len(fixed_keys), 2)
        if len(in_keys) == 1:  # Bisect will always be faster for M = 1
            return [(bisect.bisect_right(fixed_keys, in_keys[0]), in_keys)]
        # elif bisect_steps < iter_steps:
        #     offsets = {}
        #     for key in in_keys:
        #         offsets.setdefault(bisect_right(fixed_keys, key),
        #                            []).append(key)
        #     return [(o, offsets[o]) for o in sorted(offsets)]
        in_keys_iter = iter(in_keys)
        fixed_keys_iter = enumerate(fixed_keys)
        cur_in_key = next(in_keys_iter)
        cur_fixed_offset, cur_fixed_key = next(fixed_keys_iter)

        class InputDone(Exception):
            pass

        class FixedDone(Exception):
            pass

        output = []
        cur_keys = []

        # TODO: Another possibility is that rather than iterating on each side,
        #       we could use a combination of bisecting and iterating. For
        #       example, while cur_in_key < fixed_key, bisect to find its
        #       point, then iterate all matching keys, then bisect (restricted
        #       to only the remainder) for the next one, etc.
        try:
            while True:
                if cur_in_key < cur_fixed_key:
                    cur_keys = []
                    cur_out = (cur_fixed_offset, cur_keys)
                    output.append(cur_out)
                    while cur_in_key < cur_fixed_key:
                        cur_keys.append(cur_in_key)
                        try:
                            cur_in_key = next(in_keys_iter)
                        except StopIteration:
                            raise InputDone
                # At this point cur_in_key must be >= cur_fixed_key
                # step the cur_fixed_key until we pass the cur key, or walk off
                # the end
                while cur_in_key >= cur_fixed_key:
                    try:
                        (cur_fixed_offset,
                         cur_fixed_key) = next(fixed_keys_iter)
                    except StopIteration:
                        raise FixedDone
        except InputDone:
            # We consumed all of the input, nothing more to do
            pass
        except FixedDone:
            # There was some input left, but we consumed all of fixed, so we
            # have to add one more for the tail
            cur_keys = [cur_in_key]
            cur_keys.extend(in_keys_iter)
            cur_out = (len(fixed_keys), cur_keys)
            output.append(cur_out)

        return output
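
    # Worked example for _multi_bisect_right (illustrative):
    #   _multi_bisect_right([1, 4, 9], [3, 6]) == [(0, [1]), (1, [4]), (2, [9])]
    # 1 sorts before fixed key 3 (offset 0), 4 lands between 3 and 6
    # (offset 1), and 9 is past every fixed key (offset 2).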

    def _walk_through_internal_nodes(self, keys):
        """Take the given set of keys, and find the corresponding LeafNodes.

        :param keys: An unsorted iterable of keys to search for
        :return: (nodes, keys_at_index)
            nodes is a dict mapping {index: LeafNode}
            keys_at_index is a list of tuples of [(index, [keys for Leaf])]
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        keys_at_index = [(0, sorted(keys))]

        for row_pos, next_row_start in enumerate(self._row_offsets[1:-1]):
            node_indexes = [idx for idx, s_keys in keys_at_index]
            nodes = self._get_internal_nodes(node_indexes)

            next_nodes_and_keys = []
            for node_index, sub_keys in keys_at_index:
                node = nodes[node_index]
                positions = self._multi_bisect_right(sub_keys, node.keys)
                node_offset = next_row_start + node.offset
                next_nodes_and_keys.extend([(node_offset + pos, s_keys)
                                            for pos, s_keys in positions])
            keys_at_index = next_nodes_and_keys
        # We should now be at the _LeafNodes
        node_indexes = [idx for idx, s_keys in keys_at_index]

        # TODO: We may *not* want to always read all the nodes in one
        #       big go. Consider setting a max size on this.
        nodes = self._get_leaf_nodes(node_indexes)
        return nodes, keys_at_index

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        # 6 seconds spent in miss_torture using the sorted() line.
        # Even with out of order disk IO it seems faster not to sort it when
        # large queries are being made.
        # However, now that we are doing multi-way bisecting, we need the keys
        # in sorted order anyway. We could change the multi-way code to not
        # require sorted order. (For example, it bisects for the first node,
        # does an in-order search until a key comes before the current point,
        # which it then bisects for, etc.)
        keys = frozenset(keys)
        if not keys:
            return

        if not self.key_count():
            return

        needed_keys = []
        if self._leaf_value_cache is None:
            needed_keys = keys
        else:
            for key in keys:
                value = self._leaf_value_cache.get(key, None)
                if value is not None:
                    # This key is cached, so yield it directly
                    value, refs = value
                    if self.node_ref_lists:
                        yield (self, key, value, refs)
                    else:
                        yield (self, key, value)
                else:
                    needed_keys.append(key)
        if not needed_keys:
            return
        nodes, nodes_and_keys = self._walk_through_internal_nodes(needed_keys)
        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            node = nodes[node_index]
            for next_sub_key in sub_keys:
                if next_sub_key in node:
                    value, refs = node[next_sub_key]
                    if self.node_ref_lists:
                        yield (self, next_sub_key, value, refs)
                    else:
                        yield (self, next_sub_key, value)

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """Find the parent_map information for the set of keys.

        This populates the parent_map dict and missing_keys set based on the
        queried keys. It also can fill out an arbitrary number of parents that
        it finds while searching for the supplied keys.

        It is unlikely that you want to call this directly. See
        "CombinedGraphIndex.find_ancestry()" for a more appropriate API.

        :param keys: The keys whose ancestry we want to return
            Every key will either end up in 'parent_map' or 'missing_keys'.
        :param ref_list_num: This index in the ref_lists is the parents we
            care about.
        :param parent_map: {key: parent_keys} for keys that are present in this
            index. This may contain more entries than were in 'keys', that are
            reachable ancestors of the keys requested.
        :param missing_keys: keys which are known to be missing in this index.
            This may include parents that were not directly requested, but we
            were able to determine that they are not present in this index.
        :return: search_keys parents that were found but not queried to know
            if they are missing or present. Callers can re-query this index for
            those keys, and they will be placed into parent_map or missing_keys
        """
        if not self.key_count():
            # We use key_count() to trigger reading the root node and
            # determining info about this BTreeGraphIndex
            # If we don't have any keys, then everything is missing
            missing_keys.update(keys)
            return set()
        if ref_list_num >= self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))

        # The main trick we are trying to accomplish is that when we find a
        # key listing its parents, we expect that the parent key is also likely
        # to sit on the same page. Allowing us to expand parents quickly
        # without suffering the full stack of bisecting, etc.
        nodes, nodes_and_keys = self._walk_through_internal_nodes(keys)

        # These are parent keys which could not be immediately resolved on the
        # page where the child was present. Note that we may already be
        # searching for that key, and it may actually be present [or known
        # missing] on one of the other pages we are reading.
        # TODO:
        #   We could try searching for them in the immediate previous or next
        #   page. If they occur "later" we could put them in a pending lookup
        #   set, and then for each node we read thereafter we could check to
        #   see if they are present.
        #   However, we don't know the impact of keeping this list of things
        #   that I'm going to search for every node I come across from here on
        #   out.
        #   It doesn't handle the case when the parent key is missing on a
        #   page that we *don't* read. So we already have to handle being
        #   re-entrant for that.
        #   Since most keys contain a date string, they are more likely to be
        #   found earlier in the file than later, but we would know that right
        #   away (key < min_key), and wouldn't keep searching it on every other
        #   page that we read.
        #   Mostly, it is an idea, one which should be benchmarked.
        parents_not_on_page = set()

        for node_index, sub_keys in nodes_and_keys:
            if not sub_keys:
                continue
            # sub_keys is all of the keys we are looking for that should exist
            # on this page, if they aren't here, then they won't be found
            node = nodes[node_index]
            parents_to_check = set()
            for next_sub_key in sub_keys:
                if next_sub_key not in node:
                    # This one is just not present in the index at all
                    missing_keys.add(next_sub_key)
                else:
                    value, refs = node[next_sub_key]
                    parent_keys = refs[ref_list_num]
                    parent_map[next_sub_key] = parent_keys
                    parents_to_check.update(parent_keys)
            # Don't look for things we've already found
            parents_to_check = parents_to_check.difference(parent_map)
            # this can be used to test the benefit of having the check loop
            # inlined.
            # parents_not_on_page.update(parents_to_check)
            # continue
            while parents_to_check:
                next_parents_to_check = set()
                for key in parents_to_check:
                    if key in node:
                        value, refs = node[key]
                        parent_keys = refs[ref_list_num]
                        parent_map[key] = parent_keys
                        next_parents_to_check.update(parent_keys)
                    else:
                        # This parent either is genuinely missing, or should be
                        # found on another page. Perf test whether it is better
                        # to check if this node should fit on this page or not.
                        # in the 'everything-in-one-pack' scenario, this *not*
                        # doing the check is 237ms vs 243ms.
                        # So slightly better, but I assume the standard 'lots
                        # of packs' is going to show a reasonable improvement
                        # from the check, because it avoids 'going around
                        # again' for everything that is in another index
                        # parents_not_on_page.add(key)
                        # Missing for some reason
                        if key < node.min_key:
                            # in the case of bzr.dev, 3.4k/5.3k misses are
                            # 'earlier' misses (65%)
                            parents_not_on_page.add(key)
                        elif key > node.max_key:
                            # This parent key would be present on a different
                            # LeafNode
                            parents_not_on_page.add(key)
                        else:
                            # assert (key != node.min_key and
                            #         key != node.max_key)
                            # If it was going to be present, it would be on
                            # *this* page, so mark it missing.
                            missing_keys.add(key)
                parents_to_check = next_parents_to_check.difference(parent_map)
                # Might want to do another .difference() from missing_keys
        # parents_not_on_page could have been found on a different page, or be
        # known to be missing. So cull out everything that has already been
        # found.
        search_keys = parents_not_on_page.difference(
            parent_map).difference(missing_keys)
        return search_keys

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        WARNING: Note that this method currently causes a full index parse
        unconditionally (which is reasonably appropriate as it is a means for
        thunking many small indices into one larger one and still supplies
        iter_all_entries at the thunk layer).

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = sorted(set(keys))
        if not keys:
            return
        # Load if needed to check key lengths
        if self._key_count is None:
            self._get_root_node()
        # TODO: only access nodes that can satisfy the prefixes we are looking
        # for. For now, to meet API usage (as this function is not used by
        # current breezy) just suck the entire index and iterate in memory.
        nodes = {}
        nodes_by_key = None
        if self.node_ref_lists:
            if self._key_length == 1:
                for _1, key, value, refs in self.iter_all_entries():
                    nodes[key] = value, refs
            else:
                nodes_by_key = {}
                for _1, key, value, refs in self.iter_all_entries():
                    key_value = key, value, refs
                    # For a key of (foo, bar, baz) create
                    # _nodes_by_key[foo][bar][baz] = key_value
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        else:
            if self._key_length == 1:
                for _1, key, value in self.iter_all_entries():
                    nodes[key] = value
            else:
                nodes_by_key = {}
                for _1, key, value in self.iter_all_entries():
                    key_value = key, value
                    # For a key of (foo, bar, baz) create
                    # _nodes_by_key[foo][bar][baz] = key_value
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key_value
        if self._key_length == 1:
            for key in keys:
                index._sanity_check_key(self, key)
                try:
                    if self.node_ref_lists:
                        value, node_refs = nodes[key]
                        yield self, key, value, node_refs
                    else:
                        yield self, key, nodes[key]
                except KeyError:
                    pass
            return
        for entry in index._iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For BTreeGraphIndex the estimate is exact as it is contained in the
        header.
        """
        if self._key_count is None:
            self._get_root_node()
        return self._key_count

    def _compute_row_offsets(self):
        """Fill out the _row_offsets attribute based on _row_lengths."""
        offsets = []
        row_offset = 0
        for row in self._row_lengths:
            offsets.append(row_offset)
            row_offset += row
        offsets.append(row_offset)
        self._row_offsets = offsets
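
    # For example (illustrative): _row_lengths = [1, 4, 63] yields
    # _row_offsets = [0, 1, 5, 68]; row i occupies pages
    # [_row_offsets[i], _row_offsets[i + 1]) and [-1] is the total page
    # count.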

    def _parse_header_from_bytes(self, bytes):
        """Parse the header from a region of bytes.

        :param bytes: The data to parse.
        :return: An offset, data tuple such as readv yields, for the unparsed
            data. (which may be of length 0).
        """
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise index.BadIndexFormatSignature(self._name, BTreeGraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise index.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise index.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise index.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise index.BadIndexOptions(self)
        options_line = lines[3]
        if not options_line.startswith(_OPTION_ROW_LENGTHS):
            raise index.BadIndexOptions(self)
        try:
            self._row_lengths = [int(length) for length in
                                 options_line[len(_OPTION_ROW_LENGTHS):].split(
                                     b',')
                                 if length]
        except ValueError:
            raise index.BadIndexOptions(self)
        self._compute_row_offsets()

        # calculate the bytes we have processed
        header_end = (len(signature) + sum(map(len, lines[0:4])) + 4)
        return header_end, bytes[header_end:]
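
    # Illustrative header (assumed values): an index with one reference
    # list, single-element keys and ten entries on one leaf page starts
    #   b'B+Tree Graph Index 2\nnode_ref_lists=1\nkey_elements=1\n'
    #   b'len=10\nrow_lengths=1\n'
    # header_end adds 4 to account for the newlines that splitlines()
    # strips from the four option lines.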

    def _read_nodes(self, nodes):
        """Read some nodes from disk into the LRU cache.

        This performs a readv to get the node data into memory, and parses each
        node, then yields it to the caller. The nodes are requested in the
        supplied order. If possible doing sort() on the list before requesting
        a read may improve performance.

        :param nodes: The nodes to read. 0 - first node, 1 - second node etc.
        :return: None
        """
        # may be the byte string of the whole file
        bytes = None
        # list of (offset, length) regions of the file that should, eventually
        # be read in to data_ranges, either from 'bytes' or from the transport
        ranges = []
        base_offset = self._base_offset
        for index in nodes:
            offset = (index * _PAGE_SIZE)
            size = _PAGE_SIZE
            if index == 0:
                # Root node - special case
                if self._size:
                    size = min(_PAGE_SIZE, self._size)
                else:
                    # The only case where we don't know the size, is for very
                    # small indexes. So we read the whole thing
                    bytes = self._transport.get_bytes(self._name)
                    num_bytes = len(bytes)
                    self._size = num_bytes - base_offset
                    # the whole thing should be parsed out of 'bytes'
                    ranges = [(start, min(_PAGE_SIZE, num_bytes - start))
                              for start in range(
                                  base_offset, num_bytes, _PAGE_SIZE)]
                    break
            else:
                if offset > self._size:
                    raise AssertionError('tried to read past the end'
                                         ' of the file %s > %s'
                                         % (offset, self._size))
                size = min(size, self._size - offset)
            ranges.append((base_offset + offset, size))
        if not ranges:
            return
        elif bytes is not None:
            # already have the whole file
            data_ranges = [(start, bytes[start:start + size])
                           for start, size in ranges]
        elif self._file is None:
            data_ranges = self._transport.readv(self._name, ranges)
        else:
            data_ranges = []
            for offset, size in ranges:
                self._file.seek(offset)
                data_ranges.append((offset, self._file.read(size)))
        for offset, data in data_ranges:
            offset -= base_offset
            if offset == 0:
                # extract the header
                offset, data = self._parse_header_from_bytes(data)
                if len(data) == 0:
                    continue
            bytes = zlib.decompress(data)
            if bytes.startswith(_LEAF_FLAG):
                node = self._leaf_factory(bytes, self._key_length,
                                          self.node_ref_lists)
            elif bytes.startswith(_INTERNAL_FLAG):
                node = _InternalNode(bytes)
            else:
                raise AssertionError("Unknown node type for %r" % bytes)
            yield offset // _PAGE_SIZE, node

    def _signature(self):
        """The file signature for this index type."""
        return _BTSIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # just read and parse every node.
        self._get_root_node()
        if len(self._row_lengths) > 1:
            start_node = self._row_offsets[1]
        else:
            # We shouldn't be reading anything anyway
            start_node = 1
        node_end = self._row_offsets[-1]
        for node in self._read_nodes(list(range(start_node, node_end))):
            pass


_gcchk_factory = _LeafNode

try:
    from . import _btree_serializer_pyx as _btree_serializer
    _gcchk_factory = _btree_serializer._parse_into_chk
except ImportError as e:
    osutils.failed_to_load_extension(e)
    from . import _btree_serializer_py as _btree_serializer
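

# Illustrative usage sketch, not part of the original module: build a small
# index and read it back through the GraphIndex API. The helper name
# (_demo_roundtrip) and the file name ('demo.bix') are hypothetical; the
# wiring mirrors how _spill_mem_keys_to_disk hands a finished file to
# BTreeGraphIndex.
def _demo_roundtrip():
    builder = BTreeBuilder(reference_lists=0, key_elements=1)
    builder.add_node((b'rev-1',), b'value one')
    builder.add_node((b'rev-2',), b'value two')
    # finish() returns a file handle for the serialised B+Tree.
    data = builder.finish().read()
    t = transport.get_transport_from_path('.')
    t.put_bytes('demo.bix', data)
    idx = BTreeGraphIndex(t, 'demo.bix', len(data))
    # With no reference lists, entries are (index, key, value) tuples.
    return sorted(entry[1] for entry in idx.iter_all_entries())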