# Copyright (C) 2007-2011 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Indexing facilities."""

from __future__ import absolute_import

__all__ = [
    'CombinedGraphIndex',
    'GraphIndex',
    'GraphIndexBuilder',
    'GraphIndexPrefixAdapter',
    'InMemoryGraphIndex',
    ]

import re
from bisect import bisect_right

from ..lazy_import import lazy_import
lazy_import(globals(), """
from breezy import (
    bisect_multi,
    revision as _mod_revision,
    trace,
    )
""")
from .. import (
    debug,
    errors,
    )
from ..sixish import (
    BytesIO,
    bytesintern,
    viewitems,
    viewvalues,
    )
from ..static_tuple import StaticTuple

_HEADER_READV = (0, 200)
_OPTION_KEY_ELEMENTS = b"key_elements="
_OPTION_LEN = b"len="
_OPTION_NODE_REFS = b"node_ref_lists="
_SIGNATURE = b"Bazaar Graph Index 1\n"


class BadIndexFormatSignature(errors.BzrError):

    _fmt = "%(value)s is not an index of type %(_type)s."

    def __init__(self, value, _type):
        errors.BzrError.__init__(self)
        self.value = value
        self._type = _type


class BadIndexData(errors.BzrError):

    _fmt = "Error in data for index %(value)s."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


class BadIndexDuplicateKey(errors.BzrError):

    _fmt = "The key '%(key)s' is already in index '%(index)s'."

    def __init__(self, key, index):
        errors.BzrError.__init__(self)
        self.key = key
        self.index = index


class BadIndexKey(errors.BzrError):

    _fmt = "The key '%(key)s' is not a valid key."

    def __init__(self, key):
        errors.BzrError.__init__(self)
        self.key = key


class BadIndexOptions(errors.BzrError):

    _fmt = "Could not parse options for index %(value)s."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


class BadIndexValue(errors.BzrError):

    _fmt = "The value '%(value)s' is not a valid value."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


_whitespace_re = re.compile(b'[\t\n\x0b\x0c\r\x00 ]')
_newline_null_re = re.compile(b'[\n\0]')


def _has_key_from_parent_map(self, key):
    """Check if this index has one key.

    If it's possible to check for multiple keys at once through
    calling get_parent_map that should be faster.
    """
    return (key in self.get_parent_map([key]))


def _missing_keys_from_parent_map(self, keys):
    return set(keys) - set(self.get_parent_map(keys))


class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure::

      _SIGNATURE OPTIONS NODES NEWLINE
      _SIGNATURE     := 'Bazaar Graph Index 1' NEWLINE
      OPTIONS        := 'node_ref_lists=' DIGITS NEWLINE
      NODE           := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
      KEY            := Not-whitespace-utf8
      REFERENCES     := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
      REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
      REFERENCE      := DIGITS  ; digits is the byte offset in the index of the
                        ; referenced key.
      VALUE          := no-newline-no-null-bytes
    """
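
    # Illustrative sketch of the grammar above (the key and value are invented
    # for the example): a single-node index with one reference list and
    # one key element would serialise roughly as the following lines, using
    # NUL separators and a trailing empty line:
    #
    #   Bazaar Graph Index 1
    #   node_ref_lists=1
    #   key_elements=1
    #   len=1
    #   akey\x00\x00\x00avalue
    #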

    def __init__(self, reference_lists=0, key_elements=1):
        """Create a GraphIndex builder.

        :param reference_lists: The number of node references lists for each
            entry.
        :param key_elements: The number of bytestrings in each key.
        """
        self.reference_lists = reference_lists
        # A dict of {key: (absent, ref_lists, value)}
        self._nodes = {}
        # Keys that are referenced but not actually present in this index
        self._absent_keys = set()
        self._nodes_by_key = None
        self._key_length = key_elements
        self._optimize_for_size = False
        self._combine_backing_indices = True

    def _check_key(self, key):
        """Raise BadIndexKey if key is not a valid key for this index."""
        if type(key) not in (tuple, StaticTuple):
            raise BadIndexKey(key)
        if self._key_length != len(key):
            raise BadIndexKey(key)
        for element in key:
            if not element or type(element) != bytes or _whitespace_re.search(element) is not None:
                raise BadIndexKey(key)

    def _external_references(self):
        """Return references that are not present in this index.
        """
        keys = set()
        refs = set()
        # TODO: JAM 2008-11-21 This makes an assumption about how the reference
        #       lists are used. It is currently correct for pack-0.92 through
        #       1.9, which use the node references (3rd column) second
        #       reference list as the compression parent. Perhaps this should
        #       be moved into something higher up the stack, since it
        #       makes assumptions about how the index is used.
        if self.reference_lists > 1:
            for node in self.iter_all_entries():
                keys.add(node[1])
                refs.update(node[3][1])
            return refs - keys
        else:
            # If reference_lists == 0 there can be no external references, and
            # if reference_lists == 1, then there isn't a place to store the
            # compression parent.
            return set()

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (absent, references, value) in viewitems(self._nodes):
                    if absent:
                        continue
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (absent, references, value) in viewitems(self._nodes):
                    if absent:
                        continue
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def _update_nodes_by_key(self, key, value, node_refs):
        """Update the _nodes_by_key dict with a new key.

        For a key of (foo, bar, baz) create
        _nodes_by_key[foo][bar][baz] = key_value
        """
        if self._nodes_by_key is None:
            return
        key_dict = self._nodes_by_key
        if self.reference_lists:
            key_value = StaticTuple(key, value, node_refs)
        else:
            key_value = StaticTuple(key, value)
        for subkey in key[:-1]:
            key_dict = key_dict.setdefault(subkey, {})
        key_dict[key[-1]] = key_value

    def _check_key_ref_value(self, key, references, value):
        """Check that 'key' and 'references' are all valid.

        :param key: A key tuple. Must conform to the key interface (be a tuple,
            be of the right length, not have any whitespace or nulls in any key
            element.)
        :param references: An iterable of reference lists. Something like
            [[(ref, key)], [(ref, key), (other, key)]]
        :param value: The value associate with this key. Must not contain
            newlines or null characters.
        :return: (node_refs, absent_references)

            * node_refs: basically a packed form of 'references' where all
              keys are tuples.
            * absent_references: reference keys that are not in self._nodes.
                This may contain duplicates if the same key is referenced in
                multiple lists.
        """
        as_st = StaticTuple.from_sequence
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise BadIndexValue(references)
        node_refs = []
        absent_references = []
        for reference_list in references:
            for reference in reference_list:
                # If reference *is* in self._nodes, then we know it has already
                # been checked.
                if reference not in self._nodes:
                    self._check_key(reference)
                    absent_references.append(reference)
            reference_list = as_st([as_st(ref).intern()
                                    for ref in reference_list])
            node_refs.append(reference_list)
        return as_st(node_refs), absent_references

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        (node_refs,
         absent_references) = self._check_key_ref_value(key, references, value)
        if key in self._nodes and self._nodes[key][0] != b'a':
            raise BadIndexDuplicateKey(key, self)
        for reference in absent_references:
            # There may be duplicates, but I don't think it is worth worrying
            # about.
            self._nodes[reference] = (b'a', (), b'')
        self._absent_keys.update(absent_references)
        self._absent_keys.discard(key)
        self._nodes[key] = (b'', node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
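
    # Minimal usage sketch (illustrative only; the keys and values here are
    # invented): build an index with one reference list and serialise it.
    #
    #   builder = GraphIndexBuilder(reference_lists=1)
    #   builder.add_node((b'revid-2',), b'a value', ([(b'revid-1',)],))
    #   data = builder.finish().read()  # finish() returns a file-like object
    #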

    def clear_cache(self):
        """See GraphIndex.clear_cache()

        This is a no-op, but we need the api to conform to a generic 'Index'
        abstraction.
        """

    def finish(self):
        """Finish the index.

        :returns: cBytesIO holding the full context of the index as it
            should be written to disk.
        """
        lines = [_SIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        key_count = len(self._nodes) - len(self._absent_keys)
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        prefix_length = sum(len(x) for x in lines)
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the
        # file cannot be determined until all the inbetween references have
        # been calculated too) we pad the offsets with 0's to make them be
        # of consistent length. Using binary offsets would break the trivial
        # file parsing.
        # to calculate the width of zero's needed we do three passes:
        # one to gather all the non-reference data and the number of references.
        # one to pad all the data with reference-length and determine entry
        # addresses.
        # One to serialise.

        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup.
        nodes = sorted(viewitems(self._nodes))
        # if we do not prepass, we don't know how long it will be up front.
        expected_bytes = None
        # we only need to pre-pass if we have reference lists at all.
        if self.reference_lists:
            key_offset_info = []
            non_ref_bytes = prefix_length
            total_references = 0
            # TODO use simple multiplication for the constants in this loop.
            for key, (absent, references, value) in nodes:
                # record the offset known *so far* for this key:
                # the non reference bytes to date, and the total references to
                # date - saves reaccumulating on the second pass
                key_offset_info.append((key, non_ref_bytes, total_references))
                # key is literal, value is literal, there are 3 null's, 1 NL
                # key is variable length tuple, \x00 between elements
                non_ref_bytes += sum(len(element) for element in key)
                if self._key_length > 1:
                    non_ref_bytes += self._key_length - 1
                # value is literal bytes, there are 3 null's, 1 NL.
                non_ref_bytes += len(value) + 3 + 1
                # one byte for absent if set.
                if absent:
                    non_ref_bytes += 1
                elif self.reference_lists:
                    # (ref_lists -1) tabs
                    non_ref_bytes += self.reference_lists - 1
                    # (ref-1 cr's per ref_list)
                    for ref_list in references:
                        # how many references across the whole file?
                        total_references += len(ref_list)
                        # accrue reference separators
                        if ref_list:
                            non_ref_bytes += len(ref_list) - 1
            # how many digits are needed to represent the total byte count?
            digits = 1
            possible_total_bytes = non_ref_bytes + total_references * digits
            while 10 ** digits < possible_total_bytes:
                digits += 1
                possible_total_bytes = non_ref_bytes + total_references * digits
            expected_bytes = possible_total_bytes + 1  # terminating newline
            # resolve key addresses.
            key_addresses = {}
            for key, non_ref_bytes, total_references in key_offset_info:
                key_addresses[key] = non_ref_bytes + total_references * digits
            # serialise
            format_string = b'%%0%dd' % digits
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = []
                for reference in ref_list:
                    ref_addresses.append(format_string %
                                         key_addresses[reference])
                flattened_references.append(b'\r'.join(ref_addresses))
            string_key = b'\x00'.join(key)
            lines.append(b"%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
                b'\t'.join(flattened_references), value))
        lines.append(b'\n')
        result = BytesIO(b''.join(lines))
        if expected_bytes and len(result.getvalue()) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                ' mismatched output length and expected length: %d %d' %
                (len(result.getvalue()), expected_bytes))
        return result
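
    # Worked example of the zero-padding width computed above (the numbers
    # are invented for illustration): with non_ref_bytes=900 and
    # total_references=20, digits starts at 1 (900 + 20*1 = 920), grows to 2
    # (900 + 20*2 = 940) and then 3 (900 + 20*3 = 960); 10**3 >= 960, so the
    # loop stops and every reference is written as a 3-digit zero-padded
    # byte offset, giving expected_bytes = 961 including the trailing newline.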

    def set_optimize(self, for_size=None, combine_backing_indices=None):
        """Change how the builder tries to optimize the result.

        :param for_size: Tell the builder to try and make the index as small as
            possible.
        :param combine_backing_indices: If the builder spills to disk to save
            memory, should the on-disk indices be combined. Set to True if you
            are going to be probing the index, but to False if you are not. (If
            you are not querying, then the time spent combining is wasted.)
        """
        # GraphIndexBuilder itself doesn't pay attention to the flag yet, but
        # child builders do.
        if for_size is not None:
            self._optimize_for_size = for_size
        if combine_backing_indices is not None:
            self._combine_backing_indices = combine_backing_indices

    def find_ancestry(self, keys, ref_list_num):
        """See CombinedGraphIndex.find_ancestry()"""
        pending = set(keys)
        parent_map = {}
        missing_keys = set()
        while pending:
            next_pending = set()
            for _, key, value, ref_lists in self.iter_entries(pending):
                parent_keys = ref_lists[ref_list_num]
                parent_map[key] = parent_keys
                next_pending.update([p for p in parent_keys if p not in
                                     parent_map])
            missing_keys.update(pending.difference(parent_map))
            pending = next_pending
        return parent_map, missing_keys


class GraphIndex(object):
    """An index for data with embedded graphs.

    The index maps keys to a list of key reference lists, and a value.
    Each node has the same number of key reference lists. Each key reference
    list can be empty or an arbitrary length. The value is an opaque NULL
    terminated string without any newlines. The storage of the index is
    hidden in the interface: keys and key references are always tuples of
    bytestrings, never the internal representation (e.g. dictionary offsets).

    It is presumed that the index will not be mutated - it is static data.

    Successive iter_all_entries calls will read the entire index each time.
    Additionally, iter_entries calls will read the index linearly until the
    desired keys are found. XXX: This must be fixed before the index is
    suitable for production use. :XXX
    """

    def __init__(self, transport, name, size, unlimited_cache=False, offset=0):
        """Open an index called name on transport.

        :param transport: A breezy.transport.Transport.
        :param name: A path to provide to transport API calls.
        :param size: The size of the index in bytes. This is used for bisection
            logic to perform partial index reads. While the size could be
            obtained by statting the file this introduced an additional round
            trip as well as requiring stat'able transports, both of which are
            avoided by having it supplied. If size is None, then bisection
            support will be disabled and accessing the index will just stream
            all the data.
        :param offset: Instead of starting the index data at offset 0, start it
            at an arbitrary offset.
        """
        self._transport = transport
        self._name = name
        # Becomes a dict of key:(value, reference-list-byte-locations) used by
        # the bisection interface to store parsed but not resolved keys.
        self._bisect_nodes = None
        # Becomes a dict of key:(value, reference-list-keys) which are ready to
        # be returned directly to callers.
        self._nodes = None
        # a sorted list of slice-addresses for the parsed bytes of the file.
        # e.g. (0,1) would mean that byte 0 is parsed.
        self._parsed_byte_map = []
        # a sorted list of keys matching each slice address for parsed bytes
        # e.g. (None, 'foo@bar') would mean that the first byte contained no
        # key, and the end byte of the slice is the of the data for 'foo@bar'
        self._parsed_key_map = []
        self._key_count = None
        self._keys_by_offset = None
        self._nodes_by_key = None
        self._size = size
        # The number of bytes we've read so far in trying to process this file
        self._bytes_read = 0
        self._base_offset = offset
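
    # Illustrative sketch (the transport and file name are placeholders, not
    # part of this module): open an existing index file and look up a key.
    #
    #   index = GraphIndex(a_transport, 'pack-name.rix', size=1234)
    #   for idx, key, value, refs in index.iter_entries([(b'a-key',)]):
    #       ...
    #
    # iter_entries() yields 3-tuples instead when node_ref_lists is 0.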

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        # We don't really care about the order, just that there is an order.
        if (not isinstance(other, GraphIndex) and
                not isinstance(other, InMemoryGraphIndex)):
            raise TypeError(other)
        return hash(self) < hash(other)

    def __hash__(self):
        return hash((type(self), self._transport, self._name, self._size))

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__,
                           self._transport.abspath(self._name))

    def _buffer_all(self, stream=None):
        """Buffer all the index data.

        Mutates self._nodes and self.keys_by_offset.
        """
        if self._nodes is not None:
            # We already did this
            return
        if 'index' in debug.debug_flags:
            trace.mutter('Reading entire index %s',
                         self._transport.abspath(self._name))
        if stream is None:
            stream = self._transport.get(self._name)
            if self._base_offset != 0:
                # This is wasteful, but it is better than dealing with
                # adjusting all the offsets, etc.
                stream = BytesIO(stream.read()[self._base_offset:])
        self._read_prefix(stream)
        self._expected_elements = 3 + self._key_length
        # raw data keyed by offset
        self._keys_by_offset = {}
        # ready-to-return key:value or key:value, node_ref_lists
        self._nodes = {}
        self._nodes_by_key = None
        trailers = 0
        pos = stream.tell()
        lines = stream.read().split(b'\n')
        stream.close()
        del lines[-1]
        _, _, _, trailers = self._parse_lines(lines, pos)
        for key, absent, references, value in viewvalues(self._keys_by_offset):
            if absent:
                continue
            # resolve references:
            if self.node_ref_lists:
                node_value = (value, self._resolve_references(references))
            else:
                node_value = value
            self._nodes[key] = node_value
        # cache the keys for quick set intersections
        if trailers != 1:
            # there must be one line - the empty trailer line.
            raise BadIndexData(self)

    def clear_cache(self):
        """Clear out any cached/memoized values.

        This can be called at any time, but generally it is used when we have
        extracted some information, but don't expect to be requesting any more
        of it.
        """

    def external_references(self, ref_list_num):
        """Return references that are not present in this index.
        """
        self._buffer_all()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))
        refs = set()
        nodes = self._nodes
        for key, (value, ref_lists) in viewitems(nodes):
            ref_list = ref_lists[ref_list_num]
            refs.update([ref for ref in ref_list if ref not in nodes])
        return refs

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.node_ref_lists:
                for key, (value, references) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, value in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self._nodes is None:
            self._buffer_all()
        if self.node_ref_lists:
            for key, (value, node_ref_lists) in viewitems(self._nodes):
                yield self, key, value, node_ref_lists
        else:
            for key, value in viewitems(self._nodes):
                yield self, key, value

    def _read_prefix(self, stream):
        signature = stream.read(len(self._signature()))
        if not signature == self._signature():
            raise BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_LEN):
            raise BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):-1])
        except ValueError:
            raise BadIndexOptions(self)

    def _resolve_references(self, references):
        """Return the resolved key references for references.

        References are resolved by looking up the location of the key in the
        _keys_by_offset map and substituting the key name, preserving ordering.

        :param references: An iterable of iterables of key locations. e.g.
            [[123, 456], [123]]
        :return: A tuple of tuples of keys.
        """
        node_refs = []
        for ref_list in references:
            node_refs.append(
                tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
        return tuple(node_refs)

    @staticmethod
    def _find_index(range_map, key):
        """Helper for the _parsed_*_index calls.

        Given a range map - [(start, end), ...], finds the index of the range
        in the map for key if it is in the map, and if it is not there, the
        immediately preceeding range in the map.
        """
        result = bisect_right(range_map, key) - 1
        if result + 1 < len(range_map):
            # check the border condition, it may be in result + 1
            if range_map[result + 1][0] == key[0]:
                return result + 1
        return result

    def _parsed_byte_index(self, offset):
        """Return the index of the entry immediately before offset.

        e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
        there is one unparsed byte (the 11th, addressed as[10]). then:
        asking for 0 will return 0
        asking for 10 will return 0
        asking for 11 will return 1
        asking for 12 will return 1
        """
        key = (offset, 0)
        return self._find_index(self._parsed_byte_map, key)

    def _parsed_key_index(self, key):
        """Return the index of the entry immediately before key.

        e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
        meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
        have been parsed, then:
        asking for '' will return 0
        asking for 'a' will return 0
        asking for 'b' will return 1
        asking for 'e' will return 1
        """
        search_key = (key, b'')
        return self._find_index(self._parsed_key_map, search_key)

    def _is_parsed(self, offset):
        """Returns True if offset has been parsed."""
        index = self._parsed_byte_index(offset)
        if index == len(self._parsed_byte_map):
            return offset < self._parsed_byte_map[index - 1][1]
        start, end = self._parsed_byte_map[index]
        return offset >= start and offset < end

    def _iter_entries_from_total_buffer(self, keys):
        """Iterate over keys when the entire index is parsed."""
        # Note: See the note in BTreeBuilder.iter_entries for why we don't use
        #       .intersection() here
        nodes = self._nodes
        keys = [key for key in keys if key in nodes]
        if self.node_ref_lists:
            for key in keys:
                value, node_refs = nodes[key]
                yield self, key, value, node_refs
        else:
            for key in keys:
                yield self, key, nodes[key]

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        keys = set(keys)
        if not keys:
            return []
        if self._size is None and self._nodes is None:
            self._buffer_all()

        # We fit about 20 keys per minimum-read (4K), so if we are looking for
        # more than 1/20th of the index its likely (assuming homogenous key
        # spread) that we'll read the entire index. If we're going to do that,
        # buffer the whole thing. A better analysis might take key spread into
        # account - but B+Tree indices are better anyway.
        # We could look at all data read, and use a threshold there, which will
        # trigger on ancestry walks, but that is not yet fully mapped out.
        if self._nodes is None and len(keys) * 20 > self.key_count():
            self._buffer_all()
        if self._nodes is not None:
            return self._iter_entries_from_total_buffer(keys)
        else:
            return (result[1] for result in bisect_multi.bisect_multi_bytes(
                self._lookup_keys_via_location, self._size, keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        WARNING: Note that this method currently causes a full index parse
        unconditionally (which is reasonably appropriate as it is a means for
        thunking many small indices into one larger one and still supplies
        iter_all_entries at the thunk layer).

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        # load data - also finds key lengths
        if self._nodes is None:
            self._buffer_all()
        if self._key_length == 1:
            for key in keys:
                _sanity_check_key(self, key)
                if self.node_ref_lists:
                    value, node_refs = self._nodes[key]
                    yield self, key, value, node_refs
                else:
                    yield self, key, self._nodes[key]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in _iter_entries_prefix(self, nodes_by_key, keys):
            yield entry
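
    # Illustrative sketch of the prefix semantics documented above (the keys
    # are invented): in a two-element-key index containing (b'foo', b'bar')
    # and (b'foobar', b'gam'), iter_entries_prefix([(b'foo', None)]) matches
    # only (b'foo', b'bar') - matching is per tuple element, not per byte
    # prefix.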

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """See BTreeIndex._find_ancestors."""
        # The api can be implemented as a trivial overlay on top of
        # iter_entries, it is not an efficient implementation, but it at least
        # gets the job done.
        found_keys = set()
        search_keys = set()
        for index, key, value, refs in self.iter_entries(keys):
            parent_keys = refs[ref_list_num]
            found_keys.add(key)
            parent_map[key] = parent_keys
            search_keys.update(parent_keys)
        # Figure out what, if anything, was missing
        missing_keys.update(set(keys).difference(found_keys))
        search_keys = search_keys.difference(parent_map)
        return search_keys

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndex the estimate is exact.
        """
        if self._key_count is None:
            self._read_and_parse([_HEADER_READV])
        return self._key_count

    def _lookup_keys_via_location(self, location_keys):
        """Public interface for implementing bisection.

        If _buffer_all has been called, then all the data for the index is in
        memory, and this method should not be called, as it uses a separate
        cache because it cannot pre-resolve all indices, which buffer_all does
        not do.

        :param location_keys: A list of location(byte offset), key tuples.
        :return: A list of (location_key, result) tuples as expected by
            breezy.bisect_multi.bisect_multi_bytes.
        """
        # Possible improvements:
        #  - only bisect lookup each key once
        #  - sort the keys first, and use that to reduce the bisection window
        # -----
        # this progresses in three parts:
        # read data
        # parse it
        # attempt to answer the question from the now in memory data.
        # build the readv request
        # for each location, ask for 800 bytes - much more than rows we've seen
        # anywhere.
        readv_ranges = []
        for location, key in location_keys:
            # can we answer from cache?
            if self._bisect_nodes and key in self._bisect_nodes:
                # We have the key parsed.
                continue
            index = self._parsed_key_index(key)
            if (len(self._parsed_key_map) and
                self._parsed_key_map[index][0] <= key and
                (self._parsed_key_map[index][1] >= key or
                 # end of the file has been parsed
                 self._parsed_byte_map[index][1] == self._size)):
                # the key has been parsed, so no lookup is needed even if its
                # not present.
                continue
            # - if we have examined this part of the file already - yes
            index = self._parsed_byte_index(location)
            if (len(self._parsed_byte_map) and
                self._parsed_byte_map[index][0] <= location and
                self._parsed_byte_map[index][1] > location):
                # the byte region has been parsed, so no read is needed.
                continue
            length = 800
            if location + length > self._size:
                length = self._size - location
            # todo, trim out parsed locations.
            if length > 0:
                readv_ranges.append((location, length))
        # read the header if needed
        if self._bisect_nodes is None:
            readv_ranges.append(_HEADER_READV)
        self._read_and_parse(readv_ranges)
        result = []
        if self._nodes is not None:
            # _read_and_parse triggered a _buffer_all because we requested the
            # whole data range
            for location, key in location_keys:
                if key not in self._nodes:  # not present
                    result.append(((location, key), False))
                elif self.node_ref_lists:
                    value, refs = self._nodes[key]
                    result.append(((location, key),
                                   (self, key, value, refs)))
                else:
                    result.append(((location, key),
                                   (self, key, self._nodes[key])))
            return result
        # generate results:
        #  - figure out <, >, missing, present
        #  - result present references so we can return them.
        # keys that we cannot answer until we resolve references
        pending_references = []
        pending_locations = set()
        for location, key in location_keys:
            # can we answer from cache?
            if key in self._bisect_nodes:
                # the key has been parsed, so no lookup is needed
                if self.node_ref_lists:
                    # the references may not have been all parsed.
                    value, refs = self._bisect_nodes[key]
                    wanted_locations = []
                    for ref_list in refs:
                        for ref in ref_list:
                            if ref not in self._keys_by_offset:
                                wanted_locations.append(ref)
                    if wanted_locations:
                        pending_locations.update(wanted_locations)
                        pending_references.append((location, key))
                        continue
                    result.append(((location, key), (self, key,
                        value, self._resolve_references(refs))))
                else:
                    result.append(((location, key),
                                   (self, key, self._bisect_nodes[key])))
                continue
            else:
                # has the region the key should be in, been parsed?
                index = self._parsed_key_index(key)
                if (self._parsed_key_map[index][0] <= key and
                    (self._parsed_key_map[index][1] >= key or
                     # end of the file has been parsed
                     self._parsed_byte_map[index][1] == self._size)):
                    result.append(((location, key), False))
                    continue
            # no, is the key above or below the probed location:
            # get the range of the probed & parsed location
            index = self._parsed_byte_index(location)
            # if the key is below the start of the range, its below
            if key < self._parsed_key_map[index][0]:
                direction = -1
            else:
                direction = +1
            result.append(((location, key), direction))
        readv_ranges = []
        # lookup data to resolve references
        for location in pending_locations:
            length = 800
            if location + length > self._size:
                length = self._size - location
            # TODO: trim out parsed locations (e.g. if the 800 is into the
            # parsed region trim it, and dont use the adjust_for_latency
            # facility)
            if length > 0:
                readv_ranges.append((location, length))
        self._read_and_parse(readv_ranges)
        if self._nodes is not None:
            # The _read_and_parse triggered a _buffer_all, grab the data and
            # return it
            for location, key in pending_references:
                value, refs = self._nodes[key]
                result.append(((location, key), (self, key, value, refs)))
            return result
        for location, key in pending_references:
            # answer key references we had to look-up-late.
            value, refs = self._bisect_nodes[key]
            result.append(((location, key), (self, key,
                value, self._resolve_references(refs))))
        return result

    def _parse_header_from_bytes(self, bytes):
        """Parse the header from a region of bytes.

        :param bytes: The data to parse.
        :return: An offset, data tuple such as readv yields, for the unparsed
            data. (which may length 0).
        """
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise BadIndexFormatSignature(self._name, GraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise BadIndexOptions(self)
        # calculate the bytes we have processed
        header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
                      len(lines[2]) + 3)
        self._parsed_bytes(0, (), header_end, ())
        # setup parsing state
        self._expected_elements = 3 + self._key_length
        # raw data keyed by offset
        self._keys_by_offset = {}
        # keys with the value and node references
        self._bisect_nodes = {}
        return header_end, bytes[header_end:]

    def _parse_region(self, offset, data):
        """Parse node data returned from a readv operation.

        :param offset: The byte offset the data starts at.
        :param data: The data to parse.
        """
        end = offset + len(data)
        high_parsed = offset
        while True:
            # Trivial test - if the current index's end is within the
            # low-matching parsed range, we're done.
            index = self._parsed_byte_index(high_parsed)
            if end < self._parsed_byte_map[index][1]:
                return
            # print "[%d:%d]" % (offset, end), \
            #     self._parsed_byte_map[index:index + 2]
            high_parsed, last_segment = self._parse_segment(
                offset, data, end, index)
            if last_segment:
                return

    def _parse_segment(self, offset, data, end, index):
        """Parse one segment of data.

        :param offset: Where 'data' begins in the file.
        :param data: Some data to parse a segment of.
        :param end: Where data ends
        :param index: The current index into the parsed bytes map.
        :return: True if the parsed segment is the last possible one in the
            data block.
        :return: high_parsed_byte, last_segment.
            high_parsed_byte is the location of the highest parsed byte in this
            segment, last_segment is True if the parsed segment is the last
            possible one in the data block.
        """
        # default is to use all data
        trim_end = None
        # accomodate overlap with data before this.
        if offset < self._parsed_byte_map[index][1]:
            # overlaps the lower parsed region
            # skip the parsed data
            trim_start = self._parsed_byte_map[index][1] - offset
            # don't trim the start for \n
            start_adjacent = True
        elif offset == self._parsed_byte_map[index][1]:
            # abuts the lower parsed region
            # use all data
            trim_start = None
            # do not trim anything
            start_adjacent = True
        else:
            # does not overlap the lower parsed region
            # use all data
            trim_start = None
            # but trim the leading \n
            start_adjacent = False
        if end == self._size:
            # lines up to the end of all data:
            # use it all
            trim_end = None
            # do not strip to the last \n
            end_adjacent = True
            last_segment = True
        elif index + 1 == len(self._parsed_byte_map):
            # at the end of the parsed data
            # use it all
            trim_end = None
            # but strip to the last \n
            end_adjacent = False
            last_segment = True
        elif end == self._parsed_byte_map[index + 1][0]:
            # buts up against the next parsed region
            # use it all
            trim_end = None
            # do not strip to the last \n
            end_adjacent = True
            last_segment = True
        elif end > self._parsed_byte_map[index + 1][0]:
            # overlaps into the next parsed region
            # only consider the unparsed data
            trim_end = self._parsed_byte_map[index + 1][0] - offset
            # do not strip to the last \n as we know its an entire record
            end_adjacent = True
            last_segment = end < self._parsed_byte_map[index + 1][1]
        else:
            # does not overlap into the next region
            # use it all
            trim_end = None
            # but strip to the last \n
            end_adjacent = False
            last_segment = True
        # now find bytes to discard if needed
        if not start_adjacent:
            # work around python bug in rfind
            if trim_start is None:
                trim_start = data.find(b'\n') + 1
            else:
                trim_start = data.find(b'\n', trim_start) + 1
            if not (trim_start != 0):
                raise AssertionError('no \n was present')
            # print 'removing start', offset, trim_start, repr(data[:trim_start])
        if not end_adjacent:
            # work around python bug in rfind
            if trim_end is None:
                trim_end = data.rfind(b'\n') + 1
            else:
                trim_end = data.rfind(b'\n', None, trim_end) + 1
            if not (trim_end != 0):
                raise AssertionError('no \n was present')
            # print 'removing end', offset, trim_end, repr(data[trim_end:])
        # adjust offset and data to the parseable data.
        trimmed_data = data[trim_start:trim_end]
        if not (trimmed_data):
            raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
                % (trim_start, trim_end, offset, offset + len(data)))
        if trim_start:
            offset += trim_start
        # print "parsing", repr(trimmed_data)
        # splitlines mangles the \r delimiters.. don't use it.
        lines = trimmed_data.split(b'\n')
        del lines[-1]
        pos = offset
        first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
        for key, value in nodes:
            self._bisect_nodes[key] = value
        self._parsed_bytes(offset, first_key,
            offset + len(trimmed_data), last_key)
        return offset + len(trimmed_data), last_segment

    def _parse_lines(self, lines, pos):
        key = None
        first_key = None
        trailers = 0
        nodes = []
        for line in lines:
            if line == b'':
                # must be at the end
                if self._size:
                    if not (self._size == pos + 1):
                        raise AssertionError("%s %s" % (self._size, pos))
                trailers += 1
                continue
            elements = line.split(b'\0')
            if len(elements) != self._expected_elements:
                raise BadIndexData(self)
            # keys are tuples. Each element is a string that may occur many
            # times, so we intern them to save space. AB, RC, 200807
            key = tuple([bytesintern(element)
                         for element in elements[:self._key_length]])
            if first_key is None:
                first_key = key
            absent, references, value = elements[-3:]
            ref_lists = []
            for ref_string in references.split(b'\t'):
                ref_lists.append(tuple([
                    int(ref) for ref in ref_string.split(b'\r') if ref
                    ]))
            ref_lists = tuple(ref_lists)
            self._keys_by_offset[pos] = (key, absent, ref_lists, value)
            pos += len(line) + 1  # +1 for the \n
            if absent:
                continue
            if self.node_ref_lists:
                node_value = (value, ref_lists)
            else:
                node_value = value
            nodes.append((key, node_value))
            # print "parsed ", key
        return first_key, key, nodes, trailers

    def _parsed_bytes(self, start, start_key, end, end_key):
        """Mark the bytes from start to end as parsed.

        Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
        1) as parsed.

        :param start: The start of the parsed region.
        :param end: The end of the parsed region.
        """
        index = self._parsed_byte_index(start)
        new_value = (start, end)
        new_key = (start_key, end_key)
        if index == -1:
            # first range parsed is always the beginning.
            self._parsed_byte_map.insert(index, new_value)
            self._parsed_key_map.insert(index, new_key)
            return
        # four cases:
        # new region
        # extend lower region
        # extend higher region
        # combine two regions
        if (index + 1 < len(self._parsed_byte_map) and
            self._parsed_byte_map[index][1] == start and
            self._parsed_byte_map[index + 1][0] == end):
            # combine two regions
            self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
                self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index] = (self._parsed_key_map[index][0],
                self._parsed_key_map[index + 1][1])
            del self._parsed_byte_map[index + 1]
            del self._parsed_key_map[index + 1]
        elif self._parsed_byte_map[index][1] == start:
            # extend the lower entry
            self._parsed_byte_map[index] = (
                self._parsed_byte_map[index][0], end)
            self._parsed_key_map[index] = (
                self._parsed_key_map[index][0], end_key)
        elif (index + 1 < len(self._parsed_byte_map) and
            self._parsed_byte_map[index + 1][0] == end):
            # extend the higher entry
            self._parsed_byte_map[index + 1] = (
                start, self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index + 1] = (
                start_key, self._parsed_key_map[index + 1][1])
        else:
            # new entry
            self._parsed_byte_map.insert(index + 1, new_value)
            self._parsed_key_map.insert(index + 1, new_key)
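
    # Illustrative sketch of the region bookkeeping above (offsets invented):
    # with _parsed_byte_map == [(0, 100), (150, 200)], marking bytes 100-150
    # as parsed hits the "combine two regions" branch and leaves a single
    # entry [(0, 200)]; marking 200-220 instead extends the (150, 200) entry
    # to (150, 220).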

    def _read_and_parse(self, readv_ranges):
        """Read the ranges and parse the resulting data.

        :param readv_ranges: A prepared readv range list.
        """
        if not readv_ranges:
            return
        if self._nodes is None and self._bytes_read * 2 >= self._size:
            # We've already read more than 50% of the file and we are about to
            # request more data, just _buffer_all() and be done
            self._buffer_all()
            return

        base_offset = self._base_offset
        if base_offset != 0:
            # Rewrite the ranges for the offset
            readv_ranges = [(start + base_offset, size)
                            for start, size in readv_ranges]
        readv_data = self._transport.readv(self._name, readv_ranges, True,
            self._size + self._base_offset)
        # parse
        for offset, data in readv_data:
            offset -= base_offset
            self._bytes_read += len(data)
            if offset < 0:
                # transport.readv() expanded to extra data which isn't part of
                # this index
                data = data[-offset:]
                offset = 0
            if offset == 0 and len(data) == self._size:
                # We read the whole range, most likely because the
                # Transport upcast our readv ranges into one long request
                # for enough total data to grab the whole index.
                self._buffer_all(BytesIO(data))
                return
            if self._bisect_nodes is None:
                # this must be the start
                if not (offset == 0):
                    raise AssertionError()
                offset, data = self._parse_header_from_bytes(data)
            # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
            self._parse_region(offset, data)

    def _signature(self):
        """The file signature for this index type."""
        return _SIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # iter_all validates completely at the moment, so just do that.
        for node in self.iter_all_entries():
            pass


class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    The backing indices must implement GraphIndex, and are presumed to be
    static data.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of indices can thus influence
    performance significantly. For example, if one index is on local disk and a
    second on a remote server, the local disk index should be before the other
    in the index list.

    Also, queries tend to need results from the same indices as previous
    queries. So the indices will be reordered after every query to put the
    indices that had the result(s) of that query first (while otherwise
    preserving the relative ordering).
    """

    def __init__(self, indices, reload_func=None):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        :param reload_func: A function to call if we find we are missing an
            index. Should have the form reload_func() => True/False to indicate
            if reloading actually changed anything.
        """
        self._indices = indices
        self._reload_func = reload_func
        # Sibling indices are other CombinedGraphIndex that we should call
        # _move_to_front_by_name on when we auto-reorder ourself.
        self._sibling_indices = []
        # A list of names that corresponds to the instances in self._indices,
        # so _index_names[0] is always the name for _indices[0], etc.  Sibling
        # indices must all use the same set of names as each other.
        self._index_names = [None] * len(self._indices)

    def __repr__(self):
        return "%s(%s)" % (
            self.__class__.__name__,
            ', '.join(map(repr, self._indices)))

    def clear_cache(self):
        """See GraphIndex.clear_cache()"""
        for index in self._indices:
            index.clear_cache()

    def get_parent_map(self, keys):
        """See graph.StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if _mod_revision.NULL_REVISION in search_keys:
            search_keys.discard(_mod_revision.NULL_REVISION)
            found_parents = {_mod_revision.NULL_REVISION: []}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            parents = refs[0]
            if not parents:
                parents = (_mod_revision.NULL_REVISION,)
            found_parents[key] = parents
        return found_parents
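
    # Illustrative sketch (the two index objects are assumed to exist and are
    # not defined here): stack indices and query them as one; the first index
    # listed is searched first.
    #
    #   combined = CombinedGraphIndex([local_index, remote_index])
    #   parent_map = combined.get_parent_map([(b'a-revision-id',)])
    #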

    __contains__ = _has_key_from_parent_map

    def insert_index(self, pos, index, name=None):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        :param name: a name for this index, e.g. a pack name.  These names can
            be used to reflect index reorderings to related CombinedGraphIndex
            instances that use the same names.  (see set_sibling_indices)
        """
        self._indices.insert(pos, index)
        self._index_names.insert(pos, name)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :return: An iterable of (index, key, reference_lists, value).
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        seen_keys = set()
        while True:
            try:
                for index in self._indices:
                    for node in index.iter_all_entries():
                        if node[1] not in seen_keys:
                            yield node
                            seen_keys.add(node[1])
                return
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, reference_lists, value). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index.
        """
        keys = set(keys)
        hit_indices = []
        while True:
            try:
                for index in self._indices:
                    if not keys:
                        break
                    index_hit = False
                    for node in index.iter_entries(keys):
                        keys.remove(node[1])
                        yield node
                        index_hit = True
                    if index_hit:
                        hit_indices.append(index)
                break
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise
        self._move_to_front(hit_indices)

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        seen_keys = set()
        hit_indices = []
        while True:
            try:
                for index in self._indices:
                    index_hit = False
                    for node in index.iter_entries_prefix(keys):
                        if node[1] in seen_keys:
                            continue
                        seen_keys.add(node[1])
                        yield node
                        index_hit = True
                    if index_hit:
                        hit_indices.append(index)
                break
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise
        self._move_to_front(hit_indices)

    def _move_to_front(self, hit_indices):
        """Rearrange self._indices so that hit_indices are first.

        Order is maintained as much as possible, e.g. the first unhit index
        will be the first index in _indices after the hit_indices, and the
        hit_indices will be present in exactly the order they are passed to
        _move_to_front.

        _move_to_front propagates to all objects in self._sibling_indices by
        calling _move_to_front_by_name.
        """
        if self._indices[:len(hit_indices)] == hit_indices:
            # The 'hit_indices' are already at the front (and in the same
            # order), no need to re-order
            return
        hit_names = self._move_to_front_by_index(hit_indices)
        for sibling_idx in self._sibling_indices:
            sibling_idx._move_to_front_by_name(hit_names)

    def _move_to_front_by_index(self, hit_indices):
        """Core logic for _move_to_front.

        Returns a list of names corresponding to the hit_indices param.
        """
        indices_info = zip(self._index_names, self._indices)
        if 'index' in debug.debug_flags:
            indices_info = list(indices_info)
            trace.mutter('CombinedGraphIndex reordering: currently %r, '
                         'promoting %r', indices_info, hit_indices)
        hit_names = []
        unhit_names = []
        new_hit_indices = []
        unhit_indices = []
        for offset, (name, idx) in enumerate(indices_info):
            if idx in hit_indices:
                hit_names.append(name)
                new_hit_indices.append(idx)
                if len(new_hit_indices) == len(hit_indices):
                    # We've found all of the hit entries, everything else is
                    # unhit.
                    unhit_names.extend(self._index_names[offset + 1:])
                    unhit_indices.extend(self._indices[offset + 1:])
                    break
            else:
                unhit_names.append(name)
                unhit_indices.append(idx)

        self._indices = new_hit_indices + unhit_indices
        self._index_names = hit_names + unhit_names
        if 'index' in debug.debug_flags:
            trace.mutter('CombinedGraphIndex reordered: %r', self._indices)
        return hit_names

    def _move_to_front_by_name(self, hit_names):
        """Moves indices named by 'hit_names' to front of the search order, as
        described in _move_to_front.
        """
        # Translate names to index instances, and then call
        # _move_to_front_by_index.
        indices_info = zip(self._index_names, self._indices)
        hit_indices = []
        for name, idx in indices_info:
            if name in hit_names:
                hit_indices.append(idx)
        self._move_to_front_by_index(hit_indices)
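
    # Illustrative sketch of the reordering above (the names are invented):
    # with _index_names == ['a', 'b', 'c'] and a query answered only by the
    # index named 'c', _move_to_front([index_c]) changes the search order to
    # ['c', 'a', 'b'], and any sibling CombinedGraphIndex is told to promote
    # 'c' by name as well.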

    def find_ancestry(self, keys, ref_list_num):
        """Find the complete ancestry for the given set of keys.

        Note that this is a whole-ancestry request, so it should be used
        sparingly.

        :param keys: An iterable of keys to look for
        :param ref_list_num: The reference list which references the parents
            we care about.
        :return: (parent_map, missing_keys)
        """
        # XXX: make this call _move_to_front?
        missing_keys = set()
        parent_map = {}
        keys_to_lookup = set(keys)
        while keys_to_lookup:
            # keys that *all* indexes claim are missing, stop searching them
            all_index_missing = None
            # print 'gen\tidx\tsub\tn_keys\tn_pmap\tn_miss'
            # print '%4d\t\t\t%4d\t%5d\t%5d' % (generation, len(keys_to_lookup),
            #                                   len(parent_map),
            #                                   len(missing_keys))
            for index_idx, index in enumerate(self._indices):
                # TODO: we should probably be doing something with
                #       'missing_keys' since we've already determined that
                #       those revisions have not been found anywhere
                index_missing_keys = set()
                # Find all of the ancestry we can from this index
                # keep looking until the search_keys set is empty, which means
                # things we didn't find should be in index_missing_keys
                search_keys = keys_to_lookup
                # print '    \t%2d\t\t%4d\t%5d\t%5d' % (
                #     index_idx, len(search_keys),
                #     len(parent_map), len(index_missing_keys))
                while search_keys:
                    # TODO: ref_list_num should really be a parameter, since
                    #       CombinedGraphIndex does not know what the ref lists
                    #       mean.
                    search_keys = index._find_ancestors(search_keys,
                        ref_list_num, parent_map, index_missing_keys)
                    # print '    \t  \t%2d\t%4d\t%5d\t%5d' % (
                    #     sub_generation, len(search_keys),
                    #     len(parent_map), len(index_missing_keys))
                # Now set whatever was missing to be searched in the next index
                keys_to_lookup = index_missing_keys
                if all_index_missing is None:
                    all_index_missing = set(index_missing_keys)
                else:
                    all_index_missing.intersection_update(index_missing_keys)
                if not keys_to_lookup:
                    break
            if all_index_missing is None:
                # There were no indexes, so all search keys are 'missing'
                missing_keys.update(keys_to_lookup)
                keys_to_lookup = None
            else:
                missing_keys.update(all_index_missing)
                keys_to_lookup.difference_update(all_index_missing)
        return parent_map, missing_keys

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For CombinedGraphIndex this is approximated by the sum of the keys of
        the child indices. As child indices may have duplicate keys this can
        have a maximum error of the number of child indices * largest number of
        duplicated keys.
        """
        while True:
            try:
                return sum((index.key_count() for index in self._indices), 0)
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise

    missing_keys = _missing_keys_from_parent_map

    def _try_reload(self, error):
        """We just got a NoSuchFile exception.

        Try to reload the indices, if it fails, just raise the current
        exception.
        """
        if self._reload_func is None:
            return False
        trace.mutter(
            'Trying to reload after getting exception: %s', str(error))
        if not self._reload_func():
            # We tried to reload, but nothing changed, so we fail anyway
            trace.mutter('_reload_func indicated nothing has changed.'
                         ' Raising original exception.')
            return False
        return True

    def set_sibling_indices(self, sibling_combined_graph_indices):
        """Set the CombinedGraphIndex objects to reorder after reordering self.
        """
        self._sibling_indices = sibling_combined_graph_indices

    def validate(self):
        """Validate that everything in the index can be accessed."""
        while True:
            try:
                for index in self._indices:
                    index.validate()
                return
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise


class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.

    This is designed to allow the accumulation of GraphIndex entries during a
    single write operation, where the accumulated entries need to be immediately
    available - for example via a CombinedGraphIndex.
    """

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)
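
    # Illustrative sketch (the keys, values and other index are invented):
    # accumulate entries in memory and expose them next to on-disk indices.
    #
    #   mem_index = InMemoryGraphIndex(reference_lists=1)
    #   mem_index.add_nodes([((b'key1',), b'value1', ([],))])
    #   combined = CombinedGraphIndex([mem_index, an_existing_graph_index])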

    def iter_all_entries(self):
        """Iterate over all keys within the index

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in viewitems(self._nodes):
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in viewitems(self._nodes):
                if not absent:
                    yield self, key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        # Note: See BTreeBuilder.iter_entries for an explanation of why we
        #       aren't using set().intersection() here
        nodes = self._nodes
        keys = [key for key in keys if key in nodes]
        if self.reference_lists:
            for key in keys:
                node = nodes[key]
                if node[0]:
                    continue
                yield self, key, node[2], node[1]
        else:
            for key in keys:
                node = nodes[key]
                if node[0]:
                    continue
                yield self, key, node[2]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        if self._key_length == 1:
            for key in keys:
                _sanity_check_key(self, key)
                node = self._nodes[key]
                if node[0]:
                    continue
                if self.reference_lists:
                    yield self, key, node[2], node[1]
                else:
                    yield self, key, node[2]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in _iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._nodes) - len(self._absent_keys)

    def validate(self):
        """In memory index's have no known corruption at the moment."""

    def __lt__(self, other):
        # We don't really care about the order, just that there is an order.
        if (not isinstance(other, GraphIndex) and
                not isinstance(other, InMemoryGraphIndex)):
            raise TypeError(other)
        return hash(self) < hash(other)


class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.

    Queries against this will emit queries against the adapted Graph with the
    prefix added, queries for all items use iter_entries_prefix. The returned
    nodes will have their keys and node references adjusted to remove the
    prefix. Finally, an add_nodes_callback can be supplied - when called the
    nodes and references being added will have prefix prepended.
    """

    def __init__(self, adapted, prefix, missing_key_length,
                 add_nodes_callback=None):
        """Construct an adapter against adapted with prefix."""
        self.adapted = adapted
        self.prefix_key = prefix + (None,) * missing_key_length
        self.prefix = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback
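
    # Illustrative sketch (the two-element index and the prefix are invented):
    # adapt a key_elements=2 index so callers can work with one-element keys.
    #
    #   adapter = GraphIndexPrefixAdapter(two_element_index, (b'branch-x',), 1)
    #   for idx, key, value, refs in adapter.iter_all_entries():
    #       ...  # 'key' has the (b'branch-x',) prefix stripped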

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        # save nodes in case its an iterator
        nodes = tuple(nodes)
        translated_nodes = []
        try:
            # Add prefix_key to each reference node_refs is a tuple of tuples,
            # so split it apart, and add prefix_key to the internal reference
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
                          for ref_list in node_refs))
                translated_nodes.append((self.prefix + key, value,
                                         adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
            # explicitly.
            for (key, value) in nodes:
                translated_nodes.append((self.prefix + key, value))
        self.add_nodes_callback(translated_nodes)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        self.add_nodes(((key, value, references), ))

    def _strip_prefix(self, an_iter):
        """Strip prefix data from nodes and return it."""
        for node in an_iter:
            # cross checks
            if node[1][:self.prefix_len] != self.prefix:
                raise BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                      for ref_list in node[3]))

    def iter_all_entries(self):
        """Iterate over all keys within the index

        iter_all_entries is implemented against the adapted index using
        iter_entries_prefix.

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        return self._strip_prefix(self.adapted.iter_entries(
            self.prefix + key for key in keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix(
            self.prefix + key for key in keys))

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndexPrefixAdapter this is relatively expensive - key
        iteration with the prefix is done.
        """
        return len(list(self.iter_all_entries()))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()


def _sanity_check_key(index_or_builder, key):
    """Raise BadIndexKey if key cannot be used for prefix matching."""
    if key[0] is None:
        raise BadIndexKey(key)
    if len(key) != index_or_builder._key_length:
        raise BadIndexKey(key)


def _iter_entries_prefix(index_or_builder, nodes_by_key, keys):
    """Helper for implementing prefix matching iterators."""
    for key in keys:
        _sanity_check_key(index_or_builder, key)
        # find what it refers to:
        key_dict = nodes_by_key
        elements = list(key)
        # find the subdict whose contents should be returned.
        try:
            while len(elements) and elements[0] is not None:
                key_dict = key_dict[elements[0]]
                elements.pop(0)
        except KeyError:
            # a non-existant lookup.
            continue
        if len(elements):
            dicts = [key_dict]
            while dicts:
                values_view = viewvalues(dicts.pop())
                # can't be empty or would not exist
                value = next(iter(values_view))
                if isinstance(value, dict):
                    # still descending, push values
                    dicts.extend(values_view)
                else:
                    # at leaf tuples, yield values
                    for value in values_view:
                        # each value is the key:value:node refs tuple
                        # ready to yield.
                        yield (index_or_builder, ) + value
        else:
            # the last thing looked up was a terminal element
            yield (index_or_builder, ) + key_dict