1
# Copyright (C) 2007-2011 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
"""Indexing facilities."""
19
from __future__ import absolute_import
25
'GraphIndexPrefixAdapter',
29
from bisect import bisect_right
32
from ..lazy_import import lazy_import
33
lazy_import(globals(), """
36
revision as _mod_revision,
44
from ..sixish import (
51
from ..static_tuple import StaticTuple
53
_HEADER_READV = (0, 200)
54
_OPTION_KEY_ELEMENTS = b"key_elements="
56
_OPTION_NODE_REFS = b"node_ref_lists="
57
_SIGNATURE = b"Bazaar Graph Index 1\n"
60
class BadIndexFormatSignature(errors.BzrError):
62
_fmt = "%(value)s is not an index of type %(_type)s."
64
def __init__(self, value, _type):
65
errors.BzrError.__init__(self)
70
class BadIndexData(errors.BzrError):
72
_fmt = "Error in data for index %(value)s."
74
def __init__(self, value):
75
errors.BzrError.__init__(self)
79
class BadIndexDuplicateKey(errors.BzrError):
81
_fmt = "The key '%(key)s' is already in index '%(index)s'."
83
def __init__(self, key, index):
84
errors.BzrError.__init__(self)
89
class BadIndexKey(errors.BzrError):
91
_fmt = "The key '%(key)s' is not a valid key."
93
def __init__(self, key):
94
errors.BzrError.__init__(self)
98
class BadIndexOptions(errors.BzrError):
100
_fmt = "Could not parse options for index %(value)s."
102
def __init__(self, value):
103
errors.BzrError.__init__(self)
107
class BadIndexValue(errors.BzrError):
109
_fmt = "The value '%(value)s' is not a valid value."
111
def __init__(self, value):
112
errors.BzrError.__init__(self)
116
_whitespace_re = re.compile(b'[\t\n\x0b\x0c\r\x00 ]')
117
_newline_null_re = re.compile(b'[\n\0]')
120
def _has_key_from_parent_map(self, key):
121
"""Check if this index has one key.
123
If it's possible to check for multiple keys at once by calling
get_parent_map, that should be faster.
126
return (key in self.get_parent_map([key]))
129
def _missing_keys_from_parent_map(self, keys):
130
return set(keys) - set(self.get_parent_map(keys))
133
class GraphIndexBuilder(object):
134
"""A builder that can build a GraphIndex.
136
The resulting graph has the structure::
138
_SIGNATURE OPTIONS NODES NEWLINE
139
_SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
140
OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
142
NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
143
KEY := Not-whitespace-utf8
145
REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
146
REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
147
REFERENCE := DIGITS ; digits is the byte offset in the index of the
149
VALUE := no-newline-no-null-bytes
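
For illustration, a sketch of building and serializing a tiny index (the
keys and values below are invented examples, not part of the format
definition)::

    builder = GraphIndexBuilder(reference_lists=1, key_elements=1)
    builder.add_node((b'B',), b'data for B', ([(b'A',)],))
    builder.add_node((b'A',), b'data for A', ([],))
    index_bytes = builder.finish().read()  # bytes ready to write out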
152
def __init__(self, reference_lists=0, key_elements=1):
153
"""Create a GraphIndex builder.
155
:param reference_lists: The number of node reference lists for each
157
:param key_elements: The number of bytestrings in each key.
159
self.reference_lists = reference_lists
160
# A dict of {key: (absent, ref_lists, value)}
162
# Keys that are referenced but not actually present in this index
163
self._absent_keys = set()
164
self._nodes_by_key = None
165
self._key_length = key_elements
166
self._optimize_for_size = False
167
self._combine_backing_indices = True
169
def _check_key(self, key):
170
"""Raise BadIndexKey if key is not a valid key for this index."""
171
if type(key) not in (tuple, StaticTuple):
172
raise BadIndexKey(key)
173
if self._key_length != len(key):
174
raise BadIndexKey(key)
176
for element in key:
    if (not element or type(element) != bytes
            or _whitespace_re.search(element) is not None):
        raise BadIndexKey(key)
179
def _external_references(self):
180
"""Return references that are not present in this index.
184
# TODO: JAM 2008-11-21 This makes an assumption about how the reference
185
# lists are used. It is currently correct for pack-0.92 through
186
# 1.9, which use the second reference list of the node references
# (3rd column) as the compression parent. Perhaps this should
188
# be moved into something higher up the stack, since it
189
# makes assumptions about how the index is used.
190
if self.reference_lists > 1:
191
for node in self.iter_all_entries():
193
refs.update(node[3][1])
196
# If reference_lists == 0 there can be no external references, and
197
# if reference_lists == 1, then there isn't a place to store the
201
def _get_nodes_by_key(self):
202
if self._nodes_by_key is None:
204
if self.reference_lists:
205
for key, (absent, references, value) in viewitems(self._nodes):
208
key_dict = nodes_by_key
209
for subkey in key[:-1]:
210
key_dict = key_dict.setdefault(subkey, {})
211
key_dict[key[-1]] = key, value, references
213
for key, (absent, references, value) in viewitems(self._nodes):
216
key_dict = nodes_by_key
217
for subkey in key[:-1]:
218
key_dict = key_dict.setdefault(subkey, {})
219
key_dict[key[-1]] = key, value
220
self._nodes_by_key = nodes_by_key
221
return self._nodes_by_key
223
def _update_nodes_by_key(self, key, value, node_refs):
224
"""Update the _nodes_by_key dict with a new key.
226
For a key of (foo, bar, baz) create
227
_nodes_by_key[foo][bar][baz] = key_value
229
if self._nodes_by_key is None:
231
key_dict = self._nodes_by_key
232
if self.reference_lists:
233
key_value = StaticTuple(key, value, node_refs)
235
key_value = StaticTuple(key, value)
236
for subkey in key[:-1]:
237
key_dict = key_dict.setdefault(subkey, {})
238
key_dict[key[-1]] = key_value
240
def _check_key_ref_value(self, key, references, value):
241
"""Check that 'key' and 'references' are all valid.
243
:param key: A key tuple. Must conform to the key interface (be a tuple,
244
be of the right length, not have any whitespace or nulls in any key
246
:param references: An iterable of reference lists. Something like
247
[[(ref, key)], [(ref, key), (other, key)]]
248
:param value: The value associated with this key. Must not contain
249
newlines or null characters.
250
:return: (node_refs, absent_references)
252
* node_refs: basically a packed form of 'references' where all
254
* absent_references: reference keys that are not in self._nodes.
255
This may contain duplicates if the same key is referenced in
258
as_st = StaticTuple.from_sequence
260
if _newline_null_re.search(value) is not None:
261
raise BadIndexValue(value)
262
if len(references) != self.reference_lists:
263
raise BadIndexValue(references)
265
absent_references = []
266
for reference_list in references:
267
for reference in reference_list:
268
# If reference *is* in self._nodes, then we know it has already
270
if reference not in self._nodes:
271
self._check_key(reference)
272
absent_references.append(reference)
273
reference_list = as_st([as_st(ref).intern()
274
for ref in reference_list])
275
node_refs.append(reference_list)
276
return as_st(node_refs), absent_references
278
def add_node(self, key, value, references=()):
279
"""Add a node to the index.
281
:param key: The key. keys are non-empty tuples containing
282
as many whitespace-free utf8 bytestrings as the key length
283
defined for this index.
284
:param references: An iterable of iterables of keys. Each is a
285
reference to another key.
286
:param value: The value to associate with the key. It may be any
287
bytes as long as it does not contain \\0 or \\n.
290
absent_references) = self._check_key_ref_value(key, references, value)
291
if key in self._nodes and self._nodes[key][0] != b'a':
292
raise BadIndexDuplicateKey(key, self)
293
for reference in absent_references:
294
# There may be duplicates, but I don't think it is worth worrying
296
self._nodes[reference] = (b'a', (), b'')
297
self._absent_keys.update(absent_references)
298
self._absent_keys.discard(key)
299
self._nodes[key] = (b'', node_refs, value)
300
if self._nodes_by_key is not None and self._key_length > 1:
301
self._update_nodes_by_key(key, value, node_refs)
303
def clear_cache(self):
304
"""See GraphIndex.clear_cache()
306
This is a no-op, but we need the API to conform to a generic 'Index'
313
:returns: BytesIO holding the full content of the index as it
314
should be written to disk.
317
lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
318
lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
319
key_count = len(self._nodes) - len(self._absent_keys)
320
lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
321
prefix_length = sum(len(x) for x in lines)
322
# references are byte offsets. To avoid having to do nasty
323
# polynomial work to resolve offsets (references to later in the
324
file cannot be determined until all the in-between references have
325
been calculated too) we pad the offsets with zeros to make them
326
# of consistent length. Using binary offsets would break the trivial
328
# to calculate the width of zeros needed we do three passes:
329
# one to gather all the non-reference data and the number of references.
330
# one to pad all the data with reference-length and determine entry
334
# forward sorted by key. In future we may consider topological sorting,
335
# at the cost of table scans for direct lookup, or a second index for
337
nodes = sorted(viewitems(self._nodes))
338
# if we do not prepass, we don't know how long it will be up front.
339
expected_bytes = None
340
# we only need to pre-pass if we have reference lists at all.
341
if self.reference_lists:
343
non_ref_bytes = prefix_length
345
# TODO use simple multiplication for the constants in this loop.
346
for key, (absent, references, value) in nodes:
347
# record the offset known *so far* for this key:
348
# the non reference bytes to date, and the total references to
349
# date - saves reaccumulating on the second pass
350
key_offset_info.append((key, non_ref_bytes, total_references))
351
# key is literal, value is literal, there are 3 nulls, 1 NL
352
# key is variable length tuple, \x00 between elements
353
non_ref_bytes += sum(len(element) for element in key)
354
if self._key_length > 1:
355
non_ref_bytes += self._key_length - 1
356
# value is literal bytes, there are 3 nulls, 1 NL.
357
non_ref_bytes += len(value) + 3 + 1
358
# one byte for absent if set.
361
elif self.reference_lists:
362
# (ref_lists - 1) tabs
non_ref_bytes += self.reference_lists - 1
# (refs - 1) CRs per ref_list
365
for ref_list in references:
366
# how many references across the whole file?
367
total_references += len(ref_list)
368
# accrue reference separators
370
non_ref_bytes += len(ref_list) - 1
371
# how many digits are needed to represent the total byte count?
373
possible_total_bytes = non_ref_bytes + total_references*digits
374
while 10 ** digits < possible_total_bytes:
376
possible_total_bytes = non_ref_bytes + total_references*digits
377
expected_bytes = possible_total_bytes + 1 # terminating newline
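# Worked example (illustrative numbers, not derived from any real index):
# with non_ref_bytes = 500 and total_references = 20, the loop above tries
# digits = 1 (total 520) and digits = 2 (total 540) before settling on
# digits = 3 (total 560 <= 1000), so every reference is written as a
# zero-padded 3-digit offset and expected_bytes is 561.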
378
# resolve key addresses.
380
for key, non_ref_bytes, total_references in key_offset_info:
381
key_addresses[key] = non_ref_bytes + total_references*digits
383
format_string = b'%%0%dd' % digits
384
for key, (absent, references, value) in nodes:
385
flattened_references = []
386
for ref_list in references:
388
for reference in ref_list:
389
ref_addresses.append(format_string % key_addresses[reference])
390
flattened_references.append(b'\r'.join(ref_addresses))
391
string_key = b'\x00'.join(key)
392
lines.append(b"%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
393
b'\t'.join(flattened_references), value))
395
result = BytesIO(b''.join(lines))
396
if expected_bytes and len(result.getvalue()) != expected_bytes:
397
raise errors.BzrError('Failed index creation. Internal error:'
398
' mismatched output length and expected length: %d %d' %
399
(len(result.getvalue()), expected_bytes))
402
def set_optimize(self, for_size=None, combine_backing_indices=None):
403
"""Change how the builder tries to optimize the result.
405
:param for_size: Tell the builder to try and make the index as small as
407
:param combine_backing_indices: If the builder spills to disk to save
408
memory, should the on-disk indices be combined. Set to True if you
409
are going to be probing the index, but to False if you are not. (If
410
you are not querying, then the time spent combining is wasted.)
413
# GraphIndexBuilder itself doesn't pay attention to the flag yet, but
415
if for_size is not None:
416
self._optimize_for_size = for_size
417
if combine_backing_indices is not None:
418
self._combine_backing_indices = combine_backing_indices
420
def find_ancestry(self, keys, ref_list_num):
421
"""See CombinedGraphIndex.find_ancestry()"""
427
for _, key, value, ref_lists in self.iter_entries(pending):
428
parent_keys = ref_lists[ref_list_num]
429
parent_map[key] = parent_keys
430
next_pending.update([p for p in parent_keys if p not in
432
missing_keys.update(pending.difference(parent_map))
433
pending = next_pending
434
return parent_map, missing_keys
437
class GraphIndex(object):
438
"""An index for data with embedded graphs.
440
The index maps keys to a list of key reference lists, and a value.
441
Each node has the same number of key reference lists. Each key reference
442
list can be empty or an arbitrary length. The value is an opaque NULL
443
terminated string without any newlines. The storage of the index is
444
hidden in the interface: keys and key references are always tuples of
445
bytestrings, never the internal representation (e.g. dictionary offsets).
447
It is presumed that the index will not be mutated - it is static data.
449
Successive iter_all_entries calls will read the entire index each time.
450
Additionally, iter_entries calls will read the index linearly until the
451
desired keys are found. XXX: This must be fixed before the index is
452
suitable for production use. :XXX
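
A usage sketch (the transport setup and index file name here are
illustrative only)::

    from breezy.transport import get_transport

    t = get_transport('.')
    size = t.stat('example.cix').st_size
    index = GraphIndex(t, 'example.cix', size)
    for entry in index.iter_entries([(b'some-key',)]):
        ...  # (index, key, value) or (index, key, value, reference_lists)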
455
def __init__(self, transport, name, size, unlimited_cache=False, offset=0):
456
"""Open an index called name on transport.
458
:param transport: A breezy.transport.Transport.
459
:param name: A path to provide to transport API calls.
460
:param size: The size of the index in bytes. This is used for bisection
461
logic to perform partial index reads. While the size could be
462
obtained by statting the file, doing so would introduce an additional
round trip and require stat'able transports, both of which are
464
avoided by having it supplied. If size is None, then bisection
465
support will be disabled and accessing the index will just stream
467
:param offset: Instead of starting the index data at offset 0, start it
468
at an arbitrary offset.
470
self._transport = transport
472
# Becomes a dict of key:(value, reference-list-byte-locations) used by
473
# the bisection interface to store parsed but not resolved keys.
474
self._bisect_nodes = None
475
# Becomes a dict of key:(value, reference-list-keys) which are ready to
476
# be returned directly to callers.
478
# a sorted list of slice-addresses for the parsed bytes of the file.
479
# e.g. (0,1) would mean that byte 0 is parsed.
480
self._parsed_byte_map = []
481
# a sorted list of keys matching each slice address for parsed bytes
482
# e.g. (None, 'foo@bar') would mean that the first byte contained no
483
# key, and the end byte of the slice is the end of the data for 'foo@bar'
484
self._parsed_key_map = []
485
self._key_count = None
486
self._keys_by_offset = None
487
self._nodes_by_key = None
489
# The number of bytes we've read so far in trying to process this file
491
self._base_offset = offset
493
def __eq__(self, other):
494
"""Equal when self and other were created with the same parameters."""
496
isinstance(self, type(other)) and
497
self._transport == other._transport and
498
self._name == other._name and
499
self._size == other._size)
501
def __ne__(self, other):
502
return not self.__eq__(other)
504
def __lt__(self, other):
505
# We don't really care about the order, just that there is an order.
506
if (not isinstance(other, GraphIndex) and
507
not isinstance(other, InMemoryGraphIndex)):
508
raise TypeError(other)
509
return hash(self) < hash(other)
512
return hash((type(self), self._transport, self._name, self._size))
515
return "%s(%r)" % (self.__class__.__name__,
516
self._transport.abspath(self._name))
518
def _buffer_all(self, stream=None):
519
"""Buffer all the index data.
521
Mutates self._nodes and self._keys_by_offset.
523
if self._nodes is not None:
524
# We already did this
526
if 'index' in debug.debug_flags:
527
trace.mutter('Reading entire index %s',
528
self._transport.abspath(self._name))
530
stream = self._transport.get(self._name)
531
if self._base_offset != 0:
532
# This is wasteful, but it is better than dealing with
533
# adjusting all the offsets, etc.
534
stream = BytesIO(stream.read()[self._base_offset:])
536
self._read_prefix(stream)
537
self._expected_elements = 3 + self._key_length
539
# raw data keyed by offset
540
self._keys_by_offset = {}
541
# ready-to-return key:value or key:value, node_ref_lists
543
self._nodes_by_key = None
546
lines = stream.read().split(b'\n')
550
_, _, _, trailers = self._parse_lines(lines, pos)
551
for key, absent, references, value in viewvalues(self._keys_by_offset):
554
# resolve references:
555
if self.node_ref_lists:
556
node_value = (value, self._resolve_references(references))
559
self._nodes[key] = node_value
560
# cache the keys for quick set intersections
562
if trailers != 1:
    # there must be one line - the empty trailer line.
    raise BadIndexData(self)
565
def clear_cache(self):
566
"""Clear out any cached/memoized values.
568
This can be called at any time, but generally it is used when we have
569
extracted some information, but don't expect to be requesting any more
573
def external_references(self, ref_list_num):
574
"""Return references that are not present in this index.
577
if ref_list_num + 1 > self.node_ref_lists:
578
raise ValueError('No ref list %d, index has %d ref lists'
579
% (ref_list_num, self.node_ref_lists))
582
for key, (value, ref_lists) in viewitems(nodes):
583
ref_list = ref_lists[ref_list_num]
584
refs.update([ref for ref in ref_list if ref not in nodes])
587
def _get_nodes_by_key(self):
588
if self._nodes_by_key is None:
590
if self.node_ref_lists:
591
for key, (value, references) in viewitems(self._nodes):
592
key_dict = nodes_by_key
593
for subkey in key[:-1]:
594
key_dict = key_dict.setdefault(subkey, {})
595
key_dict[key[-1]] = key, value, references
597
for key, value in viewitems(self._nodes):
598
key_dict = nodes_by_key
599
for subkey in key[:-1]:
600
key_dict = key_dict.setdefault(subkey, {})
601
key_dict[key[-1]] = key, value
602
self._nodes_by_key = nodes_by_key
603
return self._nodes_by_key
605
def iter_all_entries(self):
606
"""Iterate over all keys within the index.
608
:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
609
The former tuple is used when there are no reference lists in the
610
index, making the API compatible with simple key:value index types.
611
There is no defined order for the result iteration - it will be in
612
the most efficient order for the index.
614
if 'evil' in debug.debug_flags:
615
trace.mutter_callsite(3,
616
"iter_all_entries scales with size of history.")
617
if self._nodes is None:
619
if self.node_ref_lists:
620
for key, (value, node_ref_lists) in viewitems(self._nodes):
621
yield self, key, value, node_ref_lists
623
for key, value in viewitems(self._nodes):
624
yield self, key, value
626
def _read_prefix(self, stream):
627
signature = stream.read(len(self._signature()))
628
if not signature == self._signature():
629
raise BadIndexFormatSignature(self._name, GraphIndex)
630
options_line = stream.readline()
631
if not options_line.startswith(_OPTION_NODE_REFS):
632
raise BadIndexOptions(self)
634
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
636
raise BadIndexOptions(self)
637
options_line = stream.readline()
638
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
639
raise BadIndexOptions(self)
641
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
643
raise BadIndexOptions(self)
644
options_line = stream.readline()
645
if not options_line.startswith(_OPTION_LEN):
646
raise BadIndexOptions(self)
648
self._key_count = int(options_line[len(_OPTION_LEN):-1])
650
raise BadIndexOptions(self)
652
def _resolve_references(self, references):
653
"""Return the resolved key references for references.
655
References are resolved by looking up the location of the key in the
656
_keys_by_offset map and substituting the key name, preserving ordering.
658
:param references: An iterable of iterables of key locations. e.g.
660
:return: A tuple of tuples of keys.
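For example, with illustrative offsets, [[123, 456]] resolves to a single
reference list containing the keys stored at byte offsets 123 and 456.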
663
for ref_list in references:
664
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
665
return tuple(node_refs)
668
def _find_index(range_map, key):
669
"""Helper for the _parsed_*_index calls.
671
Given a range map - [(start, end), ...], finds the index of the range
672
in the map for key if it is in the map, and if it is not there, the
673
immediately preceding range in the map.
675
result = bisect_right(range_map, key) - 1
676
if result + 1 < len(range_map):
677
# check the border condition, it may be in result + 1
678
if range_map[result + 1][0] == key[0]:
682
def _parsed_byte_index(self, offset):
683
"""Return the index of the entry immediately before offset.
685
e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
686
there is one unparsed byte (the 11th, addressed as [10]), then:
687
asking for 0 will return 0
688
asking for 10 will return 0
689
asking for 11 will return 1
690
asking for 12 will return 1
693
return self._find_index(self._parsed_byte_map, key)
695
def _parsed_key_index(self, key):
696
"""Return the index of the entry immediately before key.
698
e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
699
meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
700
have been parsed, then:
701
asking for '' will return 0
702
asking for 'a' will return 0
703
asking for 'b' will return 1
704
asking for 'e' will return 1
706
search_key = (key, b'')
707
return self._find_index(self._parsed_key_map, search_key)
709
def _is_parsed(self, offset):
710
"""Returns True if offset has been parsed."""
711
index = self._parsed_byte_index(offset)
712
if index == len(self._parsed_byte_map):
713
return offset < self._parsed_byte_map[index - 1][1]
714
start, end = self._parsed_byte_map[index]
715
return offset >= start and offset < end
717
def _iter_entries_from_total_buffer(self, keys):
718
"""Iterate over keys when the entire index is parsed."""
719
# Note: See the note in BTreeBuilder.iter_entries for why we don't use
720
# .intersection() here
722
keys = [key for key in keys if key in nodes]
723
if self.node_ref_lists:
725
value, node_refs = nodes[key]
726
yield self, key, value, node_refs
729
yield self, key, nodes[key]
731
def iter_entries(self, keys):
732
"""Iterate over keys within the index.
734
:param keys: An iterable providing the keys to be retrieved.
735
:return: An iterable as per iter_all_entries, but restricted to the
736
keys supplied. No additional keys will be returned, and every
737
key supplied that is in the index will be returned.
742
if self._size is None and self._nodes is None:
745
# We fit about 20 keys per minimum-read (4K), so if we are looking for
746
# more than 1/20th of the index it's likely (assuming homogeneous key
747
# spread) that we'll read the entire index. If we're going to do that,
748
# buffer the whole thing. A better analysis might take key spread into
749
# account - but B+Tree indices are better anyway.
750
# We could look at all data read, and use a threshold there, which will
751
# trigger on ancestry walks, but that is not yet fully mapped out.
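# For example (illustrative numbers): with key_count() == 2000, looking up
# more than 100 keys takes the buffer-all branch below.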
752
if self._nodes is None and len(keys) * 20 > self.key_count():
754
if self._nodes is not None:
755
return self._iter_entries_from_total_buffer(keys)
757
return (result[1] for result in bisect_multi.bisect_multi_bytes(
758
self._lookup_keys_via_location, self._size, keys))
760
def iter_entries_prefix(self, keys):
761
"""Iterate over keys within the index using prefix matching.
763
Prefix matching is applied within the tuple of a key, not to within
764
the bytestring of each key element. e.g. if you have the keys ('foo',
765
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
766
only the former key is returned.
768
WARNING: Note that this method currently causes a full index parse
769
unconditionally (which is reasonably appropriate as it is a means for
770
thunking many small indices into one larger one and still supplies
771
iter_all_entries at the thunk layer).
773
:param keys: An iterable providing the key prefixes to be retrieved.
774
Each key prefix takes the form of a tuple the length of a key, but
775
with the last N elements 'None' rather than a regular bytestring.
776
The first element cannot be 'None'.
777
:return: An iterable as per iter_all_entries, but restricted to the
778
keys with a matching prefix to those supplied. No additional keys
779
will be returned, and every match that is in the index will be
785
# load data - also finds key lengths
786
if self._nodes is None:
788
if self._key_length == 1:
790
_sanity_check_key(self, key)
791
if self.node_ref_lists:
792
value, node_refs = self._nodes[key]
793
yield self, key, value, node_refs
795
yield self, key, self._nodes[key]
797
nodes_by_key = self._get_nodes_by_key()
798
for entry in _iter_entries_prefix(self, nodes_by_key, keys):
801
def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
802
"""See BTreeIndex._find_ancestors."""
803
# The API can be implemented as a trivial overlay on top of
# iter_entries; it is not an efficient implementation, but it at least
808
for index, key, value, refs in self.iter_entries(keys):
809
parent_keys = refs[ref_list_num]
811
parent_map[key] = parent_keys
812
search_keys.update(parent_keys)
813
# Figure out what, if anything, was missing
814
missing_keys.update(set(keys).difference(found_keys))
815
search_keys = search_keys.difference(parent_map)
819
"""Return an estimate of the number of keys in this index.
821
For GraphIndex the estimate is exact.
823
if self._key_count is None:
824
self._read_and_parse([_HEADER_READV])
825
return self._key_count
827
def _lookup_keys_via_location(self, location_keys):
828
"""Public interface for implementing bisection.
830
If _buffer_all has been called, then all the data for the index is in
831
memory, and this method should not be called, as it uses a separate
832
cache because it cannot pre-resolve all indices, which buffer_all does
835
:param location_keys: A list of location(byte offset), key tuples.
836
:return: A list of (location_key, result) tuples as expected by
837
breezy.bisect_multi.bisect_multi_bytes.
839
# Possible improvements:
840
# - only bisect lookup each key once
841
# - sort the keys first, and use that to reduce the bisection window
843
# this progresses in three parts:
846
# attempt to answer the question from the now in memory data.
847
# build the readv request
848
# for each location, ask for 800 bytes - much more than rows we've seen
851
for location, key in location_keys:
852
# can we answer from cache?
853
if self._bisect_nodes and key in self._bisect_nodes:
854
# We have the key parsed.
856
index = self._parsed_key_index(key)
857
if (len(self._parsed_key_map) and
858
self._parsed_key_map[index][0] <= key and
859
(self._parsed_key_map[index][1] >= key or
860
# end of the file has been parsed
861
self._parsed_byte_map[index][1] == self._size)):
862
# the key has been parsed, so no lookup is needed even if it's
865
# - if we have examined this part of the file already - yes
866
index = self._parsed_byte_index(location)
867
if (len(self._parsed_byte_map) and
868
self._parsed_byte_map[index][0] <= location and
869
self._parsed_byte_map[index][1] > location):
870
# the byte region has been parsed, so no read is needed.
873
if location + length > self._size:
874
length = self._size - location
875
# TODO: trim out parsed locations.
877
readv_ranges.append((location, length))
878
# read the header if needed
879
if self._bisect_nodes is None:
880
readv_ranges.append(_HEADER_READV)
881
self._read_and_parse(readv_ranges)
883
if self._nodes is not None:
884
# _read_and_parse triggered a _buffer_all because we requested the
886
for location, key in location_keys:
887
if key not in self._nodes: # not present
888
result.append(((location, key), False))
889
elif self.node_ref_lists:
890
value, refs = self._nodes[key]
891
result.append(((location, key),
892
(self, key, value, refs)))
894
result.append(((location, key),
895
(self, key, self._nodes[key])))
898
# - figure out <, >, missing, present
899
# - result present references so we can return them.
900
# keys that we cannot answer until we resolve references
901
pending_references = []
902
pending_locations = set()
903
for location, key in location_keys:
904
# can we answer from cache?
905
if key in self._bisect_nodes:
906
# the key has been parsed, so no lookup is needed
907
if self.node_ref_lists:
908
# the references may not have been all parsed.
909
value, refs = self._bisect_nodes[key]
910
wanted_locations = []
911
for ref_list in refs:
913
if ref not in self._keys_by_offset:
914
wanted_locations.append(ref)
916
pending_locations.update(wanted_locations)
917
pending_references.append((location, key))
919
result.append(((location, key), (self, key,
920
value, self._resolve_references(refs))))
922
result.append(((location, key),
923
(self, key, self._bisect_nodes[key])))
926
# has the region the key should be in, been parsed?
927
index = self._parsed_key_index(key)
928
if (self._parsed_key_map[index][0] <= key and
929
(self._parsed_key_map[index][1] >= key or
930
# end of the file has been parsed
931
self._parsed_byte_map[index][1] == self._size)):
932
result.append(((location, key), False))
934
# no, is the key above or below the probed location:
935
# get the range of the probed & parsed location
936
index = self._parsed_byte_index(location)
937
# if the key is below the start of the range, it's below
938
if key < self._parsed_key_map[index][0]:
942
result.append(((location, key), direction))
944
# lookup data to resolve references
945
for location in pending_locations:
947
if location + length > self._size:
948
length = self._size - location
949
# TODO: trim out parsed locations (e.g. if the 800 is into the
950
# parsed region, trim it, and don't use the adjust_for_latency
953
readv_ranges.append((location, length))
954
self._read_and_parse(readv_ranges)
955
if self._nodes is not None:
956
# The _read_and_parse triggered a _buffer_all, grab the data and
958
for location, key in pending_references:
959
value, refs = self._nodes[key]
960
result.append(((location, key), (self, key, value, refs)))
962
for location, key in pending_references:
963
# answer key references we had to look-up-late.
964
value, refs = self._bisect_nodes[key]
965
result.append(((location, key), (self, key,
966
value, self._resolve_references(refs))))
969
def _parse_header_from_bytes(self, bytes):
970
"""Parse the header from a region of bytes.
972
:param bytes: The data to parse.
973
:return: An offset, data tuple such as readv yields, for the unparsed
974
data (which may be of length 0).
976
signature = bytes[0:len(self._signature())]
977
if not signature == self._signature():
978
raise BadIndexFormatSignature(self._name, GraphIndex)
979
lines = bytes[len(self._signature()):].splitlines()
980
options_line = lines[0]
981
if not options_line.startswith(_OPTION_NODE_REFS):
982
raise BadIndexOptions(self)
984
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
986
raise BadIndexOptions(self)
987
options_line = lines[1]
988
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
989
raise BadIndexOptions(self)
991
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
993
raise BadIndexOptions(self)
994
options_line = lines[2]
995
if not options_line.startswith(_OPTION_LEN):
996
raise BadIndexOptions(self)
998
self._key_count = int(options_line[len(_OPTION_LEN):])
1000
raise BadIndexOptions(self)
1001
# calculate the bytes we have processed
1002
header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
1004
self._parsed_bytes(0, (), header_end, ())
1005
# setup parsing state
1006
self._expected_elements = 3 + self._key_length
1007
# raw data keyed by offset
1008
self._keys_by_offset = {}
1009
# keys with the value and node references
1010
self._bisect_nodes = {}
1011
return header_end, bytes[header_end:]
1013
def _parse_region(self, offset, data):
1014
"""Parse node data returned from a readv operation.
1016
:param offset: The byte offset the data starts at.
1017
:param data: The data to parse.
1021
end = offset + len(data)
1022
high_parsed = offset
1024
# Trivial test - if the current index's end is within the
1025
# low-matching parsed range, we're done.
1026
index = self._parsed_byte_index(high_parsed)
1027
if end < self._parsed_byte_map[index][1]:
1029
# print "[%d:%d]" % (offset, end), \
1030
# self._parsed_byte_map[index:index + 2]
1031
high_parsed, last_segment = self._parse_segment(
1032
offset, data, end, index)
1036
def _parse_segment(self, offset, data, end, index):
1037
"""Parse one segment of data.
1039
:param offset: Where 'data' begins in the file.
1040
:param data: Some data to parse a segment of.
1041
:param end: Where data ends
1042
:param index: The current index into the parsed bytes map.
1043
1045
:return: high_parsed_byte, last_segment.
1046
high_parsed_byte is the location of the highest parsed byte in this
1047
segment, last_segment is True if the parsed segment is the last
1048
possible one in the data block.
1050
# default is to use all data
1052
# accommodate overlap with data before this.
1053
if offset < self._parsed_byte_map[index][1]:
1054
# overlaps the lower parsed region
1055
# skip the parsed data
1056
trim_start = self._parsed_byte_map[index][1] - offset
1057
# don't trim the start for \n
1058
start_adjacent = True
1059
elif offset == self._parsed_byte_map[index][1]:
1060
# abuts the lower parsed region
1063
# do not trim anything
1064
start_adjacent = True
1066
# does not overlap the lower parsed region
1069
# but trim the leading \n
1070
start_adjacent = False
1071
if end == self._size:
1072
# lines up to the end of all data:
1075
# do not strip to the last \n
1078
elif index + 1 == len(self._parsed_byte_map):
1079
# at the end of the parsed data
1082
# but strip to the last \n
1083
end_adjacent = False
1085
elif end == self._parsed_byte_map[index + 1][0]:
1086
# abuts the next parsed region
1089
# do not strip to the last \n
1092
elif end > self._parsed_byte_map[index + 1][0]:
1093
# overlaps into the next parsed region
1094
# only consider the unparsed data
1095
trim_end = self._parsed_byte_map[index + 1][0] - offset
1096
# do not strip to the last \n as we know it's an entire record
1098
last_segment = end < self._parsed_byte_map[index + 1][1]
1100
# does not overlap into the next region
1103
# but strip to the last \n
1104
end_adjacent = False
1106
# now find bytes to discard if needed
1107
if not start_adjacent:
1108
# work around python bug in rfind
1109
if trim_start is None:
1110
trim_start = data.find(b'\n') + 1
1112
trim_start = data.find(b'\n', trim_start) + 1
1113
if trim_start == 0:
1114
raise AssertionError('no \n was present')
1115
# print 'removing start', offset, trim_start, repr(data[:trim_start])
1116
if not end_adjacent:
1117
# work around python bug in rfind
1118
if trim_end is None:
1119
trim_end = data.rfind(b'\n') + 1
1121
trim_end = data.rfind(b'\n', None, trim_end) + 1
1122
if trim_end == 0:
1123
raise AssertionError('no \n was present')
1124
# print 'removing end', offset, trim_end, repr(data[trim_end:])
1125
# adjust offset and data to the parseable data.
1126
trimmed_data = data[trim_start:trim_end]
1127
if not trimmed_data:
1128
raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
1129
% (trim_start, trim_end, offset, offset + len(data)))
1131
offset += trim_start
1132
# print "parsing", repr(trimmed_data)
1133
# splitlines mangles the \r delimiters; don't use it.
1134
lines = trimmed_data.split(b'\n')
1137
first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
1138
for key, value in nodes:
1139
self._bisect_nodes[key] = value
1140
self._parsed_bytes(offset, first_key,
1141
offset + len(trimmed_data), last_key)
1142
return offset + len(trimmed_data), last_segment
1144
def _parse_lines(self, lines, pos):
1151
# must be at the end
1153
if not (self._size == pos + 1):
1154
raise AssertionError("%s %s" % (self._size, pos))
1157
elements = line.split(b'\0')
1158
if len(elements) != self._expected_elements:
1159
raise BadIndexData(self)
1160
# keys are tuples. Each element is a string that may occur many
1161
# times, so we intern them to save space. AB, RC, 200807
1162
key = tuple([bytesintern(element) for element in elements[:self._key_length]])
1163
if first_key is None:
1165
absent, references, value = elements[-3:]
1167
for ref_string in references.split(b'\t'):
1168
ref_lists.append(tuple([
1169
int(ref) for ref in ref_string.split(b'\r') if ref
1171
ref_lists = tuple(ref_lists)
1172
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
1173
pos += len(line) + 1 # +1 for the \n
1176
if self.node_ref_lists:
1177
node_value = (value, ref_lists)
1180
nodes.append((key, node_value))
1181
# print "parsed ", key
1182
return first_key, key, nodes, trailers
1184
def _parsed_bytes(self, start, start_key, end, end_key):
1185
"""Mark the bytes from start to end as parsed.
1187
Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
1190
:param start: The start of the parsed region.
1191
:param end: The end of the parsed region.
1193
index = self._parsed_byte_index(start)
1194
new_value = (start, end)
1195
new_key = (start_key, end_key)
1197
# first range parsed is always the beginning.
1198
self._parsed_byte_map.insert(index, new_value)
1199
self._parsed_key_map.insert(index, new_key)
1203
# extend lower region
1204
# extend higher region
1205
# combine two regions
1206
if (index + 1 < len(self._parsed_byte_map) and
1207
self._parsed_byte_map[index][1] == start and
1208
self._parsed_byte_map[index + 1][0] == end):
1209
# combine two regions
1210
self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
1211
self._parsed_byte_map[index + 1][1])
1212
self._parsed_key_map[index] = (self._parsed_key_map[index][0],
1213
self._parsed_key_map[index + 1][1])
1214
del self._parsed_byte_map[index + 1]
1215
del self._parsed_key_map[index + 1]
1216
elif self._parsed_byte_map[index][1] == start:
1217
# extend the lower entry
1218
self._parsed_byte_map[index] = (
1219
self._parsed_byte_map[index][0], end)
1220
self._parsed_key_map[index] = (
1221
self._parsed_key_map[index][0], end_key)
1222
elif (index + 1 < len(self._parsed_byte_map) and
1223
self._parsed_byte_map[index + 1][0] == end):
1224
# extend the higher entry
1225
self._parsed_byte_map[index + 1] = (
1226
start, self._parsed_byte_map[index + 1][1])
1227
self._parsed_key_map[index + 1] = (
1228
start_key, self._parsed_key_map[index + 1][1])
1231
self._parsed_byte_map.insert(index + 1, new_value)
1232
self._parsed_key_map.insert(index + 1, new_key)
1234
def _read_and_parse(self, readv_ranges):
1235
"""Read the ranges and parse the resulting data.
1237
:param readv_ranges: A prepared readv range list.
1239
if not readv_ranges:
1241
if self._nodes is None and self._bytes_read * 2 >= self._size:
1242
# We've already read more than 50% of the file and we are about to
1243
# request more data, just _buffer_all() and be done
1247
base_offset = self._base_offset
1248
if base_offset != 0:
1249
# Rewrite the ranges for the offset
1250
readv_ranges = [(start+base_offset, size)
1251
for start, size in readv_ranges]
1252
readv_data = self._transport.readv(self._name, readv_ranges, True,
1253
self._size + self._base_offset)
1255
for offset, data in readv_data:
1256
offset -= base_offset
1257
self._bytes_read += len(data)
1259
# transport.readv() expanded to extra data which isn't part of
1261
data = data[-offset:]
1263
if offset == 0 and len(data) == self._size:
1264
# We read the whole range, most likely because the
1265
# Transport upcast our readv ranges into one long request
1266
# for enough total data to grab the whole index.
1267
self._buffer_all(BytesIO(data))
1269
if self._bisect_nodes is None:
1270
# this must be the start
1271
if offset != 0:
1272
raise AssertionError()
1273
offset, data = self._parse_header_from_bytes(data)
1274
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
1275
self._parse_region(offset, data)
1277
def _signature(self):
1278
"""The file signature for this index type."""
1282
"""Validate that everything in the index can be accessed."""
1283
# iter_all validates completely at the moment, so just do that.
1284
for node in self.iter_all_entries():
1288
class CombinedGraphIndex(object):
1289
"""A GraphIndex made up from smaller GraphIndices.
1291
The backing indices must implement GraphIndex, and are presumed to be
1294
Queries against the combined index will be made against the first index,
1295
and then the second and so on. The order of indices can thus influence
1296
performance significantly. For example, if one index is on local disk and a
1297
second on a remote server, the local disk index should be before the other
1300
Also, queries tend to need results from the same indices as previous
1301
queries. So the indices will be reordered after every query to put the
1302
indices that had the result(s) of that query first (while otherwise
1303
preserving the relative ordering).
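
A usage sketch (the child index objects below are placeholders)::

    combined = CombinedGraphIndex([local_index, remote_index])
    for node in combined.iter_entries(keys_to_find):
        ...  # indices that produced hits are moved to the front afterwards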
1306
def __init__(self, indices, reload_func=None):
1307
"""Create a CombinedGraphIndex backed by indices.
1309
:param indices: An ordered list of indices to query for data.
1310
:param reload_func: A function to call if we find we are missing an
1311
index. Should have the form reload_func() => True/False to indicate
1312
if reloading actually changed anything.
1314
self._indices = indices
1315
self._reload_func = reload_func
1316
# Sibling indices are other CombinedGraphIndex that we should call
1317
# _move_to_front_by_name on when we auto-reorder ourself.
1318
self._sibling_indices = []
1319
# A list of names that corresponds to the instances in self._indices,
1320
# so _index_names[0] is always the name for _indices[0], etc. Sibling
1321
# indices must all use the same set of names as each other.
1322
self._index_names = [None] * len(self._indices)
1326
self.__class__.__name__,
1327
', '.join(map(repr, self._indices)))
1329
def clear_cache(self):
1330
"""See GraphIndex.clear_cache()"""
1331
for index in self._indices:
1334
def get_parent_map(self, keys):
1335
"""See graph.StackedParentsProvider.get_parent_map"""
1336
search_keys = set(keys)
1337
if _mod_revision.NULL_REVISION in search_keys:
1338
search_keys.discard(_mod_revision.NULL_REVISION)
1339
found_parents = {_mod_revision.NULL_REVISION:[]}
1342
for index, key, value, refs in self.iter_entries(search_keys):
1345
parents = (_mod_revision.NULL_REVISION,)
1346
found_parents[key] = parents
1347
return found_parents
1349
__contains__ = _has_key_from_parent_map
1351
def insert_index(self, pos, index, name=None):
1352
"""Insert a new index in the list of indices to query.
1354
:param pos: The position to insert the index.
1355
:param index: The index to insert.
1356
:param name: a name for this index, e.g. a pack name. These names can
1357
be used to reflect index reorderings to related CombinedGraphIndex
1358
instances that use the same names. (see set_sibling_indices)
1360
self._indices.insert(pos, index)
1361
self._index_names.insert(pos, name)
1363
def iter_all_entries(self):
1364
"""Iterate over all keys within the index
1366
Duplicate keys across child indices are presumed to have the same
1367
value and are only reported once.
1369
:return: An iterable of (index, key, reference_lists, value).
1370
There is no defined order for the result iteration - it will be in
1371
the most efficient order for the index.
1376
for index in self._indices:
1377
for node in index.iter_all_entries():
1378
if node[1] not in seen_keys:
1380
seen_keys.add(node[1])
1382
except errors.NoSuchFile as e:
1383
if not self._try_reload(e):
1386
def iter_entries(self, keys):
1387
"""Iterate over keys within the index.
1389
Duplicate keys across child indices are presumed to have the same
1390
value and are only reported once.
1392
:param keys: An iterable providing the keys to be retrieved.
1393
:return: An iterable of (index, key, reference_lists, value). There is
1394
no defined order for the result iteration - it will be in the most
1395
efficient order for the index.
1401
for index in self._indices:
1405
for node in index.iter_entries(keys):
1406
keys.remove(node[1])
1410
hit_indices.append(index)
1412
except errors.NoSuchFile as e:
1413
if not self._try_reload(e):
1415
self._move_to_front(hit_indices)
1417
def iter_entries_prefix(self, keys):
1418
"""Iterate over keys within the index using prefix matching.
1420
Duplicate keys across child indices are presumed to have the same
1421
value and are only reported once.
1423
Prefix matching is applied within the tuple of a key, not to within
1424
the bytestring of each key element. e.g. if you have the keys ('foo',
1425
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
1426
only the former key is returned.
1428
:param keys: An iterable providing the key prefixes to be retrieved.
1429
Each key prefix takes the form of a tuple the length of a key, but
1430
with the last N elements 'None' rather than a regular bytestring.
1431
The first element cannot be 'None'.
1432
:return: An iterable as per iter_all_entries, but restricted to the
1433
keys with a matching prefix to those supplied. No additional keys
1434
will be returned, and every match that is in the index will be
1444
for index in self._indices:
1446
for node in index.iter_entries_prefix(keys):
1447
if node[1] in seen_keys:
1449
seen_keys.add(node[1])
1453
hit_indices.append(index)
1455
except errors.NoSuchFile as e:
1456
if not self._try_reload(e):
1458
self._move_to_front(hit_indices)
1460
def _move_to_front(self, hit_indices):
1461
"""Rearrange self._indices so that hit_indices are first.
1463
Order is maintained as much as possible, e.g. the first unhit index
1464
will be the first index in _indices after the hit_indices, and the
1465
hit_indices will be present in exactly the order they are passed to
1468
_move_to_front propagates to all objects in self._sibling_indices by
1469
calling _move_to_front_by_name.
1471
if self._indices[:len(hit_indices)] == hit_indices:
1472
# The 'hit_indices' are already at the front (and in the same
1473
# order), no need to re-order
1475
hit_names = self._move_to_front_by_index(hit_indices)
1476
for sibling_idx in self._sibling_indices:
1477
sibling_idx._move_to_front_by_name(hit_names)
1479
def _move_to_front_by_index(self, hit_indices):
1480
"""Core logic for _move_to_front.
1482
Returns a list of names corresponding to the hit_indices param.
1484
indices_info = zip(self._index_names, self._indices)
1485
if 'index' in debug.debug_flags:
1486
indices_info = list(indices_info)
1487
trace.mutter('CombinedGraphIndex reordering: currently %r, '
1488
'promoting %r', indices_info, hit_indices)
1491
new_hit_indices = []
1494
for offset, (name, idx) in enumerate(indices_info):
1495
if idx in hit_indices:
1496
hit_names.append(name)
1497
new_hit_indices.append(idx)
1498
if len(new_hit_indices) == len(hit_indices):
1499
# We've found all of the hit entries, everything else is
1501
unhit_names.extend(self._index_names[offset+1:])
1502
unhit_indices.extend(self._indices[offset+1:])
1505
unhit_names.append(name)
1506
unhit_indices.append(idx)
1508
self._indices = new_hit_indices + unhit_indices
1509
self._index_names = hit_names + unhit_names
1510
if 'index' in debug.debug_flags:
1511
trace.mutter('CombinedGraphIndex reordered: %r', self._indices)
1514
def _move_to_front_by_name(self, hit_names):
1515
"""Moves indices named by 'hit_names' to front of the search order, as
1516
described in _move_to_front.
1518
# Translate names to index instances, and then call
1519
# _move_to_front_by_index.
1520
indices_info = zip(self._index_names, self._indices)
1522
for name, idx in indices_info:
1523
if name in hit_names:
1524
hit_indices.append(idx)
1525
self._move_to_front_by_index(hit_indices)
1527
def find_ancestry(self, keys, ref_list_num):
1528
"""Find the complete ancestry for the given set of keys.
1530
Note that this is a whole-ancestry request, so it should be used
1533
:param keys: An iterable of keys to look for
1534
:param ref_list_num: The reference list which references the parents
1536
:return: (parent_map, missing_keys)
1538
# XXX: make this call _move_to_front?
1539
missing_keys = set()
1541
keys_to_lookup = set(keys)
1543
while keys_to_lookup:
1544
# keys that *all* indexes claim are missing, stop searching them
1546
all_index_missing = None
1547
# print 'gen\tidx\tsub\tn_keys\tn_pmap\tn_miss'
1548
# print '%4d\t\t\t%4d\t%5d\t%5d' % (generation, len(keys_to_lookup),
1550
# len(missing_keys))
1551
for index_idx, index in enumerate(self._indices):
1552
# TODO: we should probably be doing something with
1553
# 'missing_keys' since we've already determined that
1554
# those revisions have not been found anywhere
1555
index_missing_keys = set()
1556
# Find all of the ancestry we can from this index
1557
# keep looking until the search_keys set is empty, which means
1558
# things we didn't find should be in index_missing_keys
1559
search_keys = keys_to_lookup
1561
# print ' \t%2d\t\t%4d\t%5d\t%5d' % (
1562
# index_idx, len(search_keys),
1563
# len(parent_map), len(index_missing_keys))
1566
# TODO: ref_list_num should really be a parameter, since
1567
# CombinedGraphIndex does not know what the ref lists
1569
search_keys = index._find_ancestors(search_keys,
1570
ref_list_num, parent_map, index_missing_keys)
1571
# print ' \t \t%2d\t%4d\t%5d\t%5d' % (
1572
# sub_generation, len(search_keys),
1573
# len(parent_map), len(index_missing_keys))
1574
# Now set whatever was missing to be searched in the next index
1575
keys_to_lookup = index_missing_keys
1576
if all_index_missing is None:
1577
all_index_missing = set(index_missing_keys)
1579
all_index_missing.intersection_update(index_missing_keys)
1580
if not keys_to_lookup:
1582
if all_index_missing is None:
1583
# There were no indexes, so all search keys are 'missing'
1584
missing_keys.update(keys_to_lookup)
1585
keys_to_lookup = None
1587
missing_keys.update(all_index_missing)
1588
keys_to_lookup.difference_update(all_index_missing)
1589
return parent_map, missing_keys
1591
def key_count(self):
1592
"""Return an estimate of the number of keys in this index.
1594
For CombinedGraphIndex this is approximated by the sum of the keys of
1595
the child indices. As child indices may have duplicate keys, this can
1596
have a maximum error of the number of child indices * largest number of
1601
return sum((index.key_count() for index in self._indices), 0)
1602
except errors.NoSuchFile as e:
1603
if not self._try_reload(e):
1606
missing_keys = _missing_keys_from_parent_map
1608
def _try_reload(self, error):
1609
"""We just got a NoSuchFile exception.
1611
Try to reload the indices; if that fails, just raise the current
1614
if self._reload_func is None:
1616
trace.mutter('Trying to reload after getting exception: %s', str(error))
1617
if not self._reload_func():
1618
# We tried to reload, but nothing changed, so we fail anyway
1619
trace.mutter('_reload_func indicated nothing has changed.'
1620
' Raising original exception.')
1624
def set_sibling_indices(self, sibling_combined_graph_indices):
1625
"""Set the CombinedGraphIndex objects to reorder after reordering self.
1627
self._sibling_indices = sibling_combined_graph_indices
1630
"""Validate that everything in the index can be accessed."""
1633
for index in self._indices:
1636
except errors.NoSuchFile as e:
1637
if not self._try_reload(e):
1641
class InMemoryGraphIndex(GraphIndexBuilder):
1642
"""A GraphIndex which operates entirely out of memory and is mutable.
1644
This is designed to allow the accumulation of GraphIndex entries during a
1645
single write operation, where the accumulated entries need to be immediately
1646
available - for example via a CombinedGraphIndex.
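
A usage sketch (keys and values below are invented examples)::

    idx = InMemoryGraphIndex(reference_lists=1, key_elements=1)
    idx.add_nodes([((b'B',), b'value for B', ([(b'A',)],))])
    for index, key, value, refs in idx.iter_entries([(b'B',)]):
        ...  # key == (b'B',) and refs[0] contains (b'A',)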
1649
def add_nodes(self, nodes):
1650
"""Add nodes to the index.
1652
:param nodes: An iterable of (key, value, node_refs) entries to add.
1654
if self.reference_lists:
1655
for (key, value, node_refs) in nodes:
1656
self.add_node(key, value, node_refs)
1658
for (key, value) in nodes:
1659
self.add_node(key, value)
1661
def iter_all_entries(self):
1662
"""Iterate over all keys within the index
1664
:return: An iterable of (index, key, reference_lists, value). There is no
1665
defined order for the result iteration - it will be in the most
1666
efficient order for the index (in this case dictionary hash order).
1668
if 'evil' in debug.debug_flags:
1669
trace.mutter_callsite(3,
1670
"iter_all_entries scales with size of history.")
1671
if self.reference_lists:
1672
for key, (absent, references, value) in viewitems(self._nodes):
1674
yield self, key, value, references
1676
for key, (absent, references, value) in viewitems(self._nodes):
1678
yield self, key, value
1680
def iter_entries(self, keys):
1681
"""Iterate over keys within the index.
1683
:param keys: An iterable providing the keys to be retrieved.
1684
:return: An iterable of (index, key, value, reference_lists). There is no
1685
defined order for the result iteration - it will be in the most
1686
efficient order for the index (keys iteration order in this case).
1688
# Note: See BTreeBuilder.iter_entries for an explanation of why we
1689
# aren't using set().intersection() here
1691
keys = [key for key in keys if key in nodes]
1692
if self.reference_lists:
1696
yield self, key, node[2], node[1]
1701
yield self, key, node[2]
1703
def iter_entries_prefix(self, keys):
1704
"""Iterate over keys within the index using prefix matching.
1706
Prefix matching is applied within the tuple of a key, not to within
1707
the bytestring of each key element. e.g. if you have the keys ('foo',
1708
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
1709
only the former key is returned.
1711
:param keys: An iterable providing the key prefixes to be retrieved.
1712
Each key prefix takes the form of a tuple the length of a key, but
1713
with the last N elements 'None' rather than a regular bytestring.
1714
The first element cannot be 'None'.
1715
:return: An iterable as per iter_all_entries, but restricted to the
1716
keys with a matching prefix to those supplied. No additional keys
1717
will be returned, and every match that is in the index will be
1723
if self._key_length == 1:
1725
_sanity_check_key(self, key)
1726
node = self._nodes[key]
1729
if self.reference_lists:
1730
yield self, key, node[2], node[1]
1732
yield self, key, node[2]
1734
nodes_by_key = self._get_nodes_by_key()
1735
for entry in _iter_entries_prefix(self, nodes_by_key, keys):
1738
def key_count(self):
1739
"""Return an estimate of the number of keys in this index.
1741
For InMemoryGraphIndex the estimate is exact.
1743
return len(self._nodes) - len(self._absent_keys)
1746
"""In memory index's have no known corruption at the moment."""
1748
def __lt__(self, other):
1749
# We don't really care about the order, just that there is an order.
1750
if (not isinstance(other, GraphIndex) and
1751
not isinstance(other, InMemoryGraphIndex)):
1752
raise TypeError(other)
1753
return hash(self) < hash(other)
1756
class GraphIndexPrefixAdapter(object):
1757
"""An adapter between GraphIndex with different key lengths.
1759
Queries against this will emit queries against the adapted Graph with the
1760
prefix added, queries for all items use iter_entries_prefix. The returned
1761
nodes will have their keys and node references adjusted to remove the
1762
prefix. Finally, an add_nodes_callback can be supplied - when called the
1763
nodes and references being added will have the prefix prepended.
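
A usage sketch (the adapted index and key names are illustrative; the
adapted index is assumed to have two-element keys and at least one
reference list)::

    adapter = GraphIndexPrefixAdapter(two_element_index, (b'file-id',), 1)
    for index, key, value, refs in adapter.iter_all_entries():
        ...  # key and refs have the (b'file-id',) prefix stripped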
1766
def __init__(self, adapted, prefix, missing_key_length,
1767
add_nodes_callback=None):
1768
"""Construct an adapter against adapted with prefix."""
1769
self.adapted = adapted
1770
self.prefix_key = prefix + (None,)*missing_key_length
1771
self.prefix = prefix
1772
self.prefix_len = len(prefix)
1773
self.add_nodes_callback = add_nodes_callback
1775
def add_nodes(self, nodes):
1776
"""Add nodes to the index.
1778
:param nodes: An iterable of (key, value, node_refs) entries to add.
1780
# save nodes in case it's an iterator
1781
nodes = tuple(nodes)
1782
translated_nodes = []
1784
# Add the prefix to each reference. node_refs is a tuple of tuples,
# so split it apart, and add the prefix to each internal reference.
1786
for (key, value, node_refs) in nodes:
1787
adjusted_references = (
1788
tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
1789
for ref_list in node_refs))
1790
translated_nodes.append((self.prefix + key, value,
1791
adjusted_references))
1793
# XXX: TODO add an explicit interface for getting the reference list
1794
# status, to handle this bit of user-friendliness in the API more
1796
for (key, value) in nodes:
1797
translated_nodes.append((self.prefix + key, value))
1798
self.add_nodes_callback(translated_nodes)
1800
def add_node(self, key, value, references=()):
1801
"""Add a node to the index.
1803
:param key: The key. keys are non-empty tuples containing
1804
as many whitespace-free utf8 bytestrings as the key length
1805
defined for this index.
1806
:param references: An iterable of iterables of keys. Each is a
1807
reference to another key.
1808
:param value: The value to associate with the key. It may be any
1809
bytes as long as it does not contain \\0 or \\n.
1811
self.add_nodes(((key, value, references), ))
1813
def _strip_prefix(self, an_iter):
1814
"""Strip prefix data from nodes and return it."""
1815
for node in an_iter:
1817
if node[1][:self.prefix_len] != self.prefix:
1818
raise BadIndexData(self)
1819
for ref_list in node[3]:
1820
for ref_node in ref_list:
1821
if ref_node[:self.prefix_len] != self.prefix:
1822
raise BadIndexData(self)
1823
yield node[0], node[1][self.prefix_len:], node[2], (
1824
tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
1825
for ref_list in node[3]))
1827
def iter_all_entries(self):
1828
"""Iterate over all keys within the index
1830
iter_all_entries is implemented against the adapted index using
1831
iter_entries_prefix.
1833
:return: An iterable of (index, key, reference_lists, value). There is no
1834
defined order for the result iteration - it will be in the most
1835
efficient order for the index (in this case dictionary hash order).
1837
return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))
1839
def iter_entries(self, keys):
1840
"""Iterate over keys within the index.
1842
:param keys: An iterable providing the keys to be retrieved.
1843
:return: An iterable of (index, key, value, reference_lists). There is no
1844
defined order for the result iteration - it will be in the most
1845
efficient order for the index (keys iteration order in this case).
1847
return self._strip_prefix(self.adapted.iter_entries(
1848
self.prefix + key for key in keys))
1850
def iter_entries_prefix(self, keys):
1851
"""Iterate over keys within the index using prefix matching.
1853
Prefix matching is applied within the tuple of a key, not to within
1854
the bytestring of each key element. e.g. if you have the keys ('foo',
1855
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
1856
only the former key is returned.
1858
:param keys: An iterable providing the key prefixes to be retrieved.
1859
Each key prefix takes the form of a tuple the length of a key, but
1860
with the last N elements 'None' rather than a regular bytestring.
1861
The first element cannot be 'None'.
1862
:return: An iterable as per iter_all_entries, but restricted to the
1863
keys with a matching prefix to those supplied. No additional keys
1864
will be returned, and every match that is in the index will be
1867
return self._strip_prefix(self.adapted.iter_entries_prefix(
1868
self.prefix + key for key in keys))
1870
def key_count(self):
1871
"""Return an estimate of the number of keys in this index.
1873
For GraphIndexPrefixAdapter this is relatively expensive - key
1874
iteration with the prefix is done.
1876
return len(list(self.iter_all_entries()))
1879
"""Call the adapted's validate."""
1880
self.adapted.validate()
1883
def _sanity_check_key(index_or_builder, key):
1884
"""Raise BadIndexKey if key cannot be used for prefix matching."""
1886
raise BadIndexKey(key)
1887
if len(key) != index_or_builder._key_length:
1888
raise BadIndexKey(key)
1891
def _iter_entries_prefix(index_or_builder, nodes_by_key, keys):
1892
"""Helper for implementing prefix matching iterators."""
1894
_sanity_check_key(index_or_builder, key)
1895
# find what it refers to:
1896
key_dict = nodes_by_key
1897
elements = list(key)
1898
# find the subdict whose contents should be returned.
1900
while len(elements) and elements[0] is not None:
1901
key_dict = key_dict[elements[0]]
1904
# a non-existent lookup.
1909
values_view = viewvalues(dicts.pop())
1910
# can't be empty or would not exist
1911
value = next(iter(values_view))
1912
if isinstance(value, dict):
1913
# still descending, push values
1914
dicts.extend(values_view)
1916
# at leaf tuples, yield values
1917
for value in values_view:
1918
# each value is the key:value:node refs tuple
1920
yield (index_or_builder, ) + value
1922
# the last thing looked up was a terminal element
1923
yield (index_or_builder, ) + key_dict