# Copyright (C) 2007, 2008, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Indexing facilities."""

__all__ = [
    'CombinedGraphIndex',
    'GraphIndex',
    'GraphIndexBuilder',
    'GraphIndexPrefixAdapter',
    'InMemoryGraphIndex',
    ]

from bisect import bisect_right
from cStringIO import StringIO
import re
import sys

from bzrlib.lazy_import import lazy_import
lazy_import(globals(), """
from bzrlib import trace
from bzrlib.bisect_multi import bisect_multi_bytes
from bzrlib.revision import NULL_REVISION
from bzrlib.trace import mutter
""")
from bzrlib import (
    debug,
    errors,
    )
from bzrlib.static_tuple import StaticTuple

_HEADER_READV = (0, 200)
_OPTION_KEY_ELEMENTS = "key_elements="
_OPTION_LEN = "len="
_OPTION_NODE_REFS = "node_ref_lists="
_SIGNATURE = "Bazaar Graph Index 1\n"


_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
_newline_null_re = re.compile('[\n\0]')


def _has_key_from_parent_map(self, key):
    """Check if this index has one key.

    If it's possible to check for multiple keys at once through
    calling get_parent_map that should be faster.
    """
    return (key in self.get_parent_map([key]))


def _missing_keys_from_parent_map(self, keys):
    return set(keys) - set(self.get_parent_map(keys))
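

# Note: these two helpers are assigned as class attributes further down
# (CombinedGraphIndex.has_key and CombinedGraphIndex.missing_keys), so any
# index object exposing get_parent_map gains key-presence queries for free.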


class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES NEWLINE
    _SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
    OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
    NODES := NODE*
    NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    KEY := Not-whitespace-utf8
    ABSENT := 'a'
    REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
    REFERENCE := DIGITS ; digits is the byte offset in the index of the
                        ; referenced key.
    VALUE := no-newline-no-null-bytes
    """
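
    # A worked rendering of the grammar above (a sketch, not normative): an
    # index built with node_ref_lists=0 and key_elements=1 holding the single
    # node ('akey',) -> 'avalue' serialises as:
    #
    #   Bazaar Graph Index 1\n
    #   node_ref_lists=0\n
    #   key_elements=1\n
    #   len=1\n
    #   akey\x00\x00\x00avalue\n
    #   \n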

    def __init__(self, reference_lists=0, key_elements=1):
        """Create a GraphIndex builder.

        :param reference_lists: The number of node references lists for each
            entry.
        :param key_elements: The number of bytestrings in each key.
        """
        self.reference_lists = reference_lists
        self._keys = set()
        # A dict of {key: (absent, ref_lists, value)}
        self._nodes = {}
        self._nodes_by_key = None
        self._key_length = key_elements
        self._optimize_for_size = False
        self._combine_backing_indices = True

    def _check_key(self, key):
        """Raise BadIndexKey if key is not a valid key for this index."""
        if type(key) not in (tuple, StaticTuple):
            raise errors.BadIndexKey(key)
        if self._key_length != len(key):
            raise errors.BadIndexKey(key)
        for element in key:
            if not element or _whitespace_re.search(element) is not None:
                raise errors.BadIndexKey(element)

    def _external_references(self):
        """Return references that are not present in this index."""
        keys = set()
        refs = set()
        # TODO: JAM 2008-11-21 This makes an assumption about how the reference
        #       lists are used. It is currently correct for pack-0.92 through
        #       1.9, which use the node references (3rd column) second
        #       reference list as the compression parent. Perhaps this should
        #       be moved into something higher up the stack, since it
        #       makes assumptions about how the index is used.
        if self.reference_lists > 1:
            for node in self.iter_all_entries():
                keys.add(node[1])
                refs.update(node[3][1])
            return refs - keys
        else:
            # If reference_lists == 0 there can be no external references, and
            # if reference_lists == 1, then there isn't a place to store the
            # compression parent
            return set()

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (absent, references, value) in self._nodes.iteritems():
                    if absent:
                        continue
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (absent, references, value) in self._nodes.iteritems():
                    if absent:
                        continue
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def _update_nodes_by_key(self, key, value, node_refs):
        """Update the _nodes_by_key dict with a new key.

        For a key of (foo, bar, baz) create
        _nodes_by_key[foo][bar][baz] = key_value
        """
        if self._nodes_by_key is None:
            return
        key_dict = self._nodes_by_key
        if self.reference_lists:
            key_value = key, value, node_refs
        else:
            key_value = key, value
        for subkey in key[:-1]:
            key_dict = key_dict.setdefault(subkey, {})
        key_dict[key[-1]] = key_value

    def _check_key_ref_value(self, key, references, value):
        """Check that 'key' and 'references' are all valid.

        :param key: A key tuple. Must conform to the key interface (be a tuple,
            be of the right length, not have any whitespace or nulls in any key
            element.)
        :param references: An iterable of reference lists. Something like
            [[(ref, key)], [(ref, key), (other, key)]]
        :param value: The value associated with this key. Must not contain
            newlines or null characters.
        :return: (node_refs, absent_references)
            node_refs basically a packed form of 'references' where all
            keys are tuples.
            absent_references reference keys that are not in self._nodes.
                This may contain duplicates if the same key is
                referenced in multiple lists.
        """
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise errors.BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise errors.BadIndexValue(references)
        node_refs = []
        absent_references = []
        for reference_list in references:
            for reference in reference_list:
                # If reference *is* in self._nodes, then we know it has already
                # been checked.
                if reference not in self._nodes:
                    self._check_key(reference)
                    absent_references.append(reference)
            node_refs.append(tuple(reference_list))
        return tuple(node_refs), absent_references

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        (node_refs,
         absent_references) = self._check_key_ref_value(key, references, value)
        if key in self._nodes and self._nodes[key][0] != 'a':
            raise errors.BadIndexDuplicateKey(key, self)
        for reference in absent_references:
            # There may be duplicates, but I don't think it is worth worrying
            # about
            self._nodes[reference] = ('a', (), '')
        self._nodes[key] = ('', node_refs, value)
        self._keys.add(key)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)
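
    # Example usage (illustrative only): build a two-node graph where
    # 'child' names 'parent' in its single reference list.
    #
    #   builder = GraphIndexBuilder(reference_lists=1)
    #   builder.add_node(('parent',), 'parent value', ([],))
    #   builder.add_node(('child',), 'child value', ([('parent',)],))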

    def finish(self):
        lines = [_SIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
        lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
        prefix_length = sum(len(x) for x in lines)
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the
        # file cannot be determined until all the in-between references have
        # been calculated too) we pad the offsets with 0's to make them be
        # of consistent length. Using binary offsets would break the trivial
        # file parsing.
        # to calculate the width of zeros needed we do three passes:
        # one to gather all the non-reference data and the number of references.
        # one to pad all the data with reference-length and determine entry
        # addresses.
        # One to serialise.

        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup
        nodes = sorted(self._nodes.items())
        # if we do not prepass, we don't know how long it will be up front.
        expected_bytes = None
        # we only need to pre-pass if we have reference lists at all.
        if self.reference_lists:
            key_offset_info = []
            non_ref_bytes = prefix_length
            total_references = 0
            # TODO use simple multiplication for the constants in this loop.
            for key, (absent, references, value) in nodes:
                # record the offset known *so far* for this key:
                # the non reference bytes to date, and the total references to
                # date - saves reaccumulating on the second pass
                key_offset_info.append((key, non_ref_bytes, total_references))
                # key is literal, value is literal, there are 3 null's, 1 NL
                # key is variable length tuple, \x00 between elements
                non_ref_bytes += sum(len(element) for element in key)
                if self._key_length > 1:
                    non_ref_bytes += self._key_length - 1
                # value is literal bytes, there are 3 null's, 1 NL.
                non_ref_bytes += len(value) + 3 + 1
                # one byte for absent if set.
                if absent:
                    non_ref_bytes += 1
                elif self.reference_lists:
                    # (ref_lists -1) tabs
                    non_ref_bytes += self.reference_lists - 1
                    # (ref-1 cr's per ref_list)
                    for ref_list in references:
                        # how many references across the whole file?
                        total_references += len(ref_list)
                        # accrue reference separators
                        if ref_list:
                            non_ref_bytes += len(ref_list) - 1
            # how many digits are needed to represent the total byte count?
            digits = 1
            possible_total_bytes = non_ref_bytes + total_references*digits
            while 10 ** digits < possible_total_bytes:
                digits += 1
                possible_total_bytes = non_ref_bytes + total_references*digits
            expected_bytes = possible_total_bytes + 1 # terminating newline
            # resolve key addresses.
            key_addresses = {}
            for key, non_ref_bytes, total_references in key_offset_info:
                key_addresses[key] = non_ref_bytes + total_references*digits
            # serialise
            format_string = '%%0%sd' % digits
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = []
                for reference in ref_list:
                    ref_addresses.append(format_string % key_addresses[reference])
                flattened_references.append('\r'.join(ref_addresses))
            string_key = '\x00'.join(key)
            lines.append("%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
                '\t'.join(flattened_references), value))
        lines.append('\n')
        result = StringIO(''.join(lines))
        if expected_bytes and len(result.getvalue()) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                ' mismatched output length and expected length: %d %d' %
                (len(result.getvalue()), expected_bytes))
        return result
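
    # A minimal persistence sketch ('transport' and the file name here are
    # assumptions, not part of this module): serialise the built index and
    # re-open it read-only.
    #
    #   stream = builder.finish()
    #   size = len(stream.getvalue())
    #   transport.put_file('example.cix', stream)
    #   index = GraphIndex(transport, 'example.cix', size)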

    def set_optimize(self, for_size=None, combine_backing_indices=None):
        """Change how the builder tries to optimize the result.

        :param for_size: Tell the builder to try and make the index as small as
            possible.
        :param combine_backing_indices: If the builder spills to disk to save
            memory, should the on-disk indices be combined. Set to True if you
            are going to be probing the index, but to False if you are not. (If
            you are not querying, then the time spent combining is wasted.)
        """
        # GraphIndexBuilder itself doesn't pay attention to the flag yet, but
        # other builders do.
        if for_size is not None:
            self._optimize_for_size = for_size
        if combine_backing_indices is not None:
            self._combine_backing_indices = combine_backing_indices

    def find_ancestry(self, keys, ref_list_num):
        """See CombinedGraphIndex.find_ancestry()"""
        pending = set(keys)
        parent_map = {}
        missing_keys = set()
        while pending:
            next_pending = set()
            for _, key, value, ref_lists in self.iter_entries(pending):
                parent_keys = ref_lists[ref_list_num]
                parent_map[key] = parent_keys
                next_pending.update([p for p in parent_keys if p not in
                                     parent_map])
            missing_keys.update(pending.difference(parent_map))
            pending = next_pending
        return parent_map, missing_keys


class GraphIndex(object):
    """An index for data with embedded graphs.

    The index maps keys to a list of key reference lists, and a value.
    Each node has the same number of key reference lists. Each key reference
    list can be empty or an arbitrary length. The value is an opaque NULL
    terminated string without any newlines. The storage of the index is
    hidden in the interface: keys and key references are always tuples of
    bytestrings, never the internal representation (e.g. dictionary offsets).

    It is presumed that the index will not be mutated - it is static data.

    Successive iter_all_entries calls will read the entire index each time.
    Additionally, iter_entries calls will read the index linearly until the
    desired keys are found. XXX: This must be fixed before the index is
    suitable for production use. :XXX
    """
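
    # Typical read path (illustrative; 'transport', the name and size are
    # assumptions): open the index and look up a key.
    #
    #   index = GraphIndex(transport, 'example.cix', size)
    #   for node in index.iter_entries([('child',)]):
    #       index_obj, key, value, ref_lists = node  # 4-tuple when
    #                                                # node_ref_lists != 0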

    def __init__(self, transport, name, size, unlimited_cache=False):
        """Open an index called name on transport.

        :param transport: A bzrlib.transport.Transport.
        :param name: A path to provide to transport API calls.
        :param size: The size of the index in bytes. This is used for bisection
            logic to perform partial index reads. While the size could be
            obtained by statting the file, this introduces an additional round
            trip as well as requiring stat'able transports, both of which are
            avoided by having it supplied. If size is None, then bisection
            support will be disabled and accessing the index will just stream
            all the data.
        """
        self._transport = transport
        self._name = name
        # Becomes a dict of key:(value, reference-list-byte-locations) used by
        # the bisection interface to store parsed but not resolved keys.
        self._bisect_nodes = None
        # Becomes a dict of key:(value, reference-list-keys) which are ready to
        # be returned directly to callers.
        self._nodes = None
        # a sorted list of slice-addresses for the parsed bytes of the file.
        # e.g. (0,1) would mean that byte 0 is parsed.
        self._parsed_byte_map = []
        # a sorted list of keys matching each slice address for parsed bytes
        # e.g. (None, 'foo@bar') would mean that the first byte contained no
        # key, and the end byte of the slice is the end of the data for 'foo@bar'
        self._parsed_key_map = []
        self._key_count = None
        self._keys_by_offset = None
        self._nodes_by_key = None
        self._size = size
        # The number of bytes we've read so far in trying to process this file
        self._bytes_read = 0

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            type(self) == type(other) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__,
            self._transport.abspath(self._name))

    def _buffer_all(self, stream=None):
        """Buffer all the index data.

        Mutates self._nodes and self.keys_by_offset.
        """
        if self._nodes is not None:
            # We already did this
            return
        if 'index' in debug.debug_flags:
            mutter('Reading entire index %s', self._transport.abspath(self._name))
        if stream is None:
            stream = self._transport.get(self._name)
        self._read_prefix(stream)
        self._expected_elements = 3 + self._key_length
        # raw data keyed by offset
        self._keys_by_offset = {}
        # ready-to-return key:value or key:value, node_ref_lists
        self._nodes = {}
        self._nodes_by_key = None
        trailers = 0
        pos = stream.tell()
        lines = stream.read().split('\n')
        del lines[-1]
        _, _, _, trailers = self._parse_lines(lines, pos)
        for key, absent, references, value in self._keys_by_offset.itervalues():
            if absent:
                continue
            # resolve references:
            if self.node_ref_lists:
                node_value = (value, self._resolve_references(references))
            else:
                node_value = value
            self._nodes[key] = node_value
        # cache the keys for quick set intersections
        self._keys = set(self._nodes)
        if trailers != 1:
            # there must be one line - the empty trailer line.
            raise errors.BadIndexData(self)

    def external_references(self, ref_list_num):
        """Return references that are not present in this index."""
        self._buffer_all()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))
        refs = set()
        for key, (value, ref_lists) in self._nodes.iteritems():
            ref_list = ref_lists[ref_list_num]
            refs.update(ref_list)
        return refs - self._keys

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.node_ref_lists:
                for key, (value, references) in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, value in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or (index, key, value,
            reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self._nodes is None:
            self._buffer_all()
        if self.node_ref_lists:
            for key, (value, node_ref_lists) in self._nodes.iteritems():
                yield self, key, value, node_ref_lists
        else:
            for key, value in self._nodes.iteritems():
                yield self, key, value

    def _read_prefix(self, stream):
        signature = stream.read(len(self._signature()))
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_LEN):
            raise errors.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)

    def _resolve_references(self, references):
        """Return the resolved key references for references.

        References are resolved by looking up the location of the key in the
        _keys_by_offset map and substituting the key name, preserving ordering.

        :param references: An iterable of iterables of key locations. e.g.
            [[123, 456], [123]]
        :return: A tuple of tuples of keys.
        """
        node_refs = []
        for ref_list in references:
            node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
        return tuple(node_refs)

    def _find_index(self, range_map, key):
        """Helper for the _parsed_*_index calls.

        Given a range map - [(start, end), ...], finds the index of the range
        in the map for key if it is in the map, and if it is not there, the
        immediately preceding range in the map.
        """
        result = bisect_right(range_map, key) - 1
        if result + 1 < len(range_map):
            # check the border condition, it may be in result + 1
            if range_map[result + 1][0] == key[0]:
                return result + 1
        return result

    def _parsed_byte_index(self, offset):
        """Return the index of the entry immediately before offset.

        e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
        there is one unparsed byte (the 11th, addressed as [10]), then:
        asking for 0 will return 0
        asking for 10 will return 0
        asking for 11 will return 1
        asking for 12 will return 1
        """
        key = (offset, 0)
        return self._find_index(self._parsed_byte_map, key)

    def _parsed_key_index(self, key):
        """Return the index of the entry immediately before key.

        e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
        meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
        have been parsed, then:
        asking for '' will return 0
        asking for 'a' will return 0
        asking for 'b' will return 1
        asking for 'e' will return 1
        """
        search_key = (key, None)
        return self._find_index(self._parsed_key_map, search_key)

    def _is_parsed(self, offset):
        """Returns True if offset has been parsed."""
        index = self._parsed_byte_index(offset)
        if index == len(self._parsed_byte_map):
            return offset < self._parsed_byte_map[index - 1][1]
        start, end = self._parsed_byte_map[index]
        return offset >= start and offset < end

    def _iter_entries_from_total_buffer(self, keys):
        """Iterate over keys when the entire index is parsed."""
        keys = keys.intersection(self._keys)
        if self.node_ref_lists:
            for key in keys:
                value, node_refs = self._nodes[key]
                yield self, key, value, node_refs
        else:
            for key in keys:
                yield self, key, self._nodes[key]

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        keys = set(keys)
        if not keys:
            return []
        if self._size is None and self._nodes is None:
            self._buffer_all()

        # We fit about 20 keys per minimum-read (4K), so if we are looking for
        # more than 1/20th of the index it's likely (assuming homogeneous key
        # spread) that we'll read the entire index. If we're going to do that,
        # buffer the whole thing. A better analysis might take key spread into
        # account - but B+Tree indices are better anyway.
        # We could look at all data read, and use a threshold there, which will
        # trigger on ancestry walks, but that is not yet fully mapped out.
        if self._nodes is None and len(keys) * 20 > self.key_count():
            self._buffer_all()
        if self._nodes is not None:
            return self._iter_entries_from_total_buffer(keys)
        else:
            return (result[1] for result in bisect_multi_bytes(
                self._lookup_keys_via_location, self._size, keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        WARNING: Note that this method currently causes a full index parse
        unconditionally (which is reasonably appropriate as it is a means for
        thunking many small indices into one larger one and still supplies
        iter_all_entries at the thunk layer).

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        # load data - also finds key lengths
        if self._nodes is None:
            self._buffer_all()
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                if self.node_ref_lists:
                    value, node_refs = self._nodes[key]
                    yield self, key, value, node_refs
                else:
                    yield self, key, self._nodes[key]
            return
        nodes_by_key = self._get_nodes_by_key()
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict whose contents should be returned.
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push dict down onto the stack
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            # each value is the key:value:node refs tuple
                            # ready to yield.
                            yield (self, ) + value
            else:
                # the last thing looked up was a terminal element
                yield (self, ) + key_dict
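
    # Prefix-matching sketch: for a key_elements=2 index holding
    # ('foo', 'bar') and ('foo', 'baz'), iter_entries_prefix([('foo', None)])
    # yields both nodes, while iter_entries([('foo', 'bar')]) matches only
    # the first.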

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """See BTreeIndex._find_ancestors."""
        # The api can be implemented as a trivial overlay on top of
        # iter_entries, it is not an efficient implementation, but it at least
        # gets the job done.
        found_keys = set()
        search_keys = set()
        for index, key, value, refs in self.iter_entries(keys):
            parent_keys = refs[ref_list_num]
            found_keys.add(key)
            parent_map[key] = parent_keys
            search_keys.update(parent_keys)
        # Figure out what, if anything, was missing
        missing_keys.update(set(keys).difference(found_keys))
        search_keys = search_keys.difference(parent_map)
        return search_keys

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndex the estimate is exact.
        """
        if self._key_count is None:
            self._read_and_parse([_HEADER_READV])
        return self._key_count

    def _lookup_keys_via_location(self, location_keys):
        """Public interface for implementing bisection.

        If _buffer_all has been called, then all the data for the index is in
        memory, and this method should not be called, as it uses a separate
        cache because it cannot pre-resolve all indices, which buffer_all does
        for performance.

        :param location_keys: A list of location(byte offset), key tuples.
        :return: A list of (location_key, result) tuples as expected by
            bzrlib.bisect_multi.bisect_multi_bytes.
        """
        # Possible improvements:
        #  - only bisect lookup each key once
        #  - sort the keys first, and use that to reduce the bisection window
        # -----
        # this progresses in three parts:
        # read data
        # parse it
        # attempt to answer the question from the now in memory data.
        # build the readv request
        # for each location, ask for 800 bytes - much more than rows we've seen
        # anywhere.
        readv_ranges = []
        for location, key in location_keys:
            # can we answer from cache?
            if self._bisect_nodes and key in self._bisect_nodes:
                # We have the key parsed.
                continue
            index = self._parsed_key_index(key)
            if (len(self._parsed_key_map) and
                self._parsed_key_map[index][0] <= key and
                (self._parsed_key_map[index][1] >= key or
                 # end of the file has been parsed
                 self._parsed_byte_map[index][1] == self._size)):
                # the key has been parsed, so no lookup is needed even if it's
                # not present.
                continue
            # - if we have examined this part of the file already - yes
            index = self._parsed_byte_index(location)
            if (len(self._parsed_byte_map) and
                self._parsed_byte_map[index][0] <= location and
                self._parsed_byte_map[index][1] > location):
                # the byte region has been parsed, so no read is needed.
                continue
            length = 800
            if location + length > self._size:
                length = self._size - location
            # todo, trim out parsed locations.
            if length > 0:
                readv_ranges.append((location, length))
        # read the header if needed
        if self._bisect_nodes is None:
            readv_ranges.append(_HEADER_READV)
        self._read_and_parse(readv_ranges)
        result = []
        if self._nodes is not None:
            # _read_and_parse triggered a _buffer_all because we requested the
            # whole data range.
            for location, key in location_keys:
                if key not in self._nodes: # not present
                    result.append(((location, key), False))
                elif self.node_ref_lists:
                    value, refs = self._nodes[key]
                    result.append(((location, key),
                        (self, key, value, refs)))
                else:
                    result.append(((location, key),
                        (self, key, self._nodes[key])))
            return result
        # generate results:
        #  - figure out <, >, missing, present
        #  - result present references so we can return them.
        # keys that we cannot answer until we resolve references
        pending_references = []
        pending_locations = set()
        for location, key in location_keys:
            # can we answer from cache?
            if key in self._bisect_nodes:
                # the key has been parsed, so no lookup is needed
                if self.node_ref_lists:
                    # the references may not have been all parsed.
                    value, refs = self._bisect_nodes[key]
                    wanted_locations = []
                    for ref_list in refs:
                        for ref in ref_list:
                            if ref not in self._keys_by_offset:
                                wanted_locations.append(ref)
                    if wanted_locations:
                        pending_locations.update(wanted_locations)
                        pending_references.append((location, key))
                        continue
                    result.append(((location, key), (self, key,
                        value, self._resolve_references(refs))))
                else:
                    result.append(((location, key),
                        (self, key, self._bisect_nodes[key])))
                continue
            else:
                # has the region the key should be in, been parsed?
                index = self._parsed_key_index(key)
                if (self._parsed_key_map[index][0] <= key and
                    (self._parsed_key_map[index][1] >= key or
                     # end of the file has been parsed
                     self._parsed_byte_map[index][1] == self._size)):
                    result.append(((location, key), False))
                    continue
            # no, is the key above or below the probed location:
            # get the range of the probed & parsed location
            index = self._parsed_byte_index(location)
            # if the key is below the start of the range, it's below
            if key < self._parsed_key_map[index][0]:
                direction = -1
            else:
                direction = +1
            result.append(((location, key), direction))
        readv_ranges = []
        # lookup data to resolve references
        for location in pending_locations:
            length = 800
            if location + length > self._size:
                length = self._size - location
            # TODO: trim out parsed locations (e.g. if the 800 is into the
            # parsed region trim it, and don't use the adjust_for_latency
            # facility)
            if length > 0:
                readv_ranges.append((location, length))
        self._read_and_parse(readv_ranges)
        if self._nodes is not None:
            # The _read_and_parse triggered a _buffer_all, grab the data and
            # return it
            for location, key in pending_references:
                value, refs = self._nodes[key]
                result.append(((location, key), (self, key, value, refs)))
            return result
        for location, key in pending_references:
            # answer key references we had to look-up-late.
            value, refs = self._bisect_nodes[key]
            result.append(((location, key), (self, key,
                value, self._resolve_references(refs))))
        return result

    def _parse_header_from_bytes(self, bytes):
        """Parse the header from a region of bytes.

        :param bytes: The data to parse.
        :return: An offset, data tuple such as readv yields, for the unparsed
            data. (which may be of length 0).
        """
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, GraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise errors.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        # calculate the bytes we have processed
        header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
            len(lines[2]) + 3)
        self._parsed_bytes(0, None, header_end, None)
        # setup parsing state
        self._expected_elements = 3 + self._key_length
        # raw data keyed by offset
        self._keys_by_offset = {}
        # keys with the value and node references
        self._bisect_nodes = {}
        return header_end, bytes[header_end:]

    def _parse_region(self, offset, data):
        """Parse node data returned from a readv operation.

        :param offset: The byte offset the data starts at.
        :param data: The data to parse.
        """
        # trim the data.
        # end first:
        end = offset + len(data)
        high_parsed = offset
        while True:
            # Trivial test - if the current index's end is within the
            # low-matching parsed range, we're done.
            index = self._parsed_byte_index(high_parsed)
            if end < self._parsed_byte_map[index][1]:
                return
            # print "[%d:%d]" % (offset, end), \
            #       self._parsed_byte_map[index:index + 2]
            high_parsed, last_segment = self._parse_segment(
                offset, data, end, index)
            if last_segment:
                return

    def _parse_segment(self, offset, data, end, index):
        """Parse one segment of data.

        :param offset: Where 'data' begins in the file.
        :param data: Some data to parse a segment of.
        :param end: Where data ends
        :param index: The current index into the parsed bytes map.
        :return: high_parsed_byte, last_segment.
            high_parsed_byte is the location of the highest parsed byte in this
            segment, last_segment is True if the parsed segment is the last
            possible one in the data block.
        """
        # default is to use all data
        trim_end = None
        # accommodate overlap with data before this.
        if offset < self._parsed_byte_map[index][1]:
            # overlaps the lower parsed region
            # skip the parsed data
            trim_start = self._parsed_byte_map[index][1] - offset
            # don't trim the start for \n
            start_adjacent = True
        elif offset == self._parsed_byte_map[index][1]:
            # abuts the lower parsed region
            # use all data
            trim_start = None
            # do not trim anything
            start_adjacent = True
        else:
            # does not overlap the lower parsed region
            # use all data
            trim_start = None
            # but trim the leading \n
            start_adjacent = False
        if end == self._size:
            # lines up to the end of all data:
            # use it all
            trim_end = None
            # do not strip to the last \n
            end_adjacent = True
            last_segment = True
        elif index + 1 == len(self._parsed_byte_map):
            # at the end of the parsed data
            # use it all
            trim_end = None
            # but strip to the last \n
            end_adjacent = False
            last_segment = True
        elif end == self._parsed_byte_map[index + 1][0]:
            # buts up against the next parsed region
            # use it all
            trim_end = None
            # do not strip to the last \n
            end_adjacent = True
            last_segment = True
        elif end > self._parsed_byte_map[index + 1][0]:
            # overlaps into the next parsed region
            # only consider the unparsed data
            trim_end = self._parsed_byte_map[index + 1][0] - offset
            # do not strip to the last \n as we know it's an entire record
            end_adjacent = True
            last_segment = end < self._parsed_byte_map[index + 1][1]
        else:
            # does not overlap into the next region
            # use it all
            trim_end = None
            # but strip to the last \n
            end_adjacent = False
            last_segment = True
        # now find bytes to discard if needed
        if not start_adjacent:
            # work around python bug in rfind
            if trim_start is None:
                trim_start = data.find('\n') + 1
            else:
                trim_start = data.find('\n', trim_start) + 1
            if not (trim_start != 0):
                raise AssertionError('no \n was present')
            # print 'removing start', offset, trim_start, repr(data[:trim_start])
        if not end_adjacent:
            # work around python bug in rfind
            if trim_end is None:
                trim_end = data.rfind('\n') + 1
            else:
                trim_end = data.rfind('\n', None, trim_end) + 1
            if not (trim_end != 0):
                raise AssertionError('no \n was present')
            # print 'removing end', offset, trim_end, repr(data[trim_end:])
        # adjust offset and data to the parseable data.
        trimmed_data = data[trim_start:trim_end]
        if not (trimmed_data):
            raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
                % (trim_start, trim_end, offset, offset + len(data)))
        if trim_start:
            offset += trim_start
        # print "parsing", repr(trimmed_data)
        # splitlines mangles the \r delimiters.. don't use it.
        lines = trimmed_data.split('\n')
        del lines[-1]
        pos = offset
        first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
        for key, value in nodes:
            self._bisect_nodes[key] = value
        self._parsed_bytes(offset, first_key,
            offset + len(trimmed_data), last_key)
        return offset + len(trimmed_data), last_segment

    def _parse_lines(self, lines, pos):
        key = None
        first_key = None
        trailers = 0
        nodes = []
        for line in lines:
            if line == '':
                # must be at the end
                if self._size:
                    if not (self._size == pos + 1):
                        raise AssertionError("%s %s" % (self._size, pos))
                trailers += 1
                continue
            elements = line.split('\0')
            if len(elements) != self._expected_elements:
                raise errors.BadIndexData(self)
            # keys are tuples. Each element is a string that may occur many
            # times, so we intern them to save space. AB, RC, 200807
            key = tuple([intern(element) for element in elements[:self._key_length]])
            if first_key is None:
                first_key = key
            absent, references, value = elements[-3:]
            ref_lists = []
            for ref_string in references.split('\t'):
                ref_lists.append(tuple([
                    int(ref) for ref in ref_string.split('\r') if ref
                    ]))
            ref_lists = tuple(ref_lists)
            self._keys_by_offset[pos] = (key, absent, ref_lists, value)
            pos += len(line) + 1 # +1 for the \n
            if absent:
                continue
            if self.node_ref_lists:
                node_value = (value, ref_lists)
            else:
                node_value = value
            nodes.append((key, node_value))
            # print "parsed ", key
        return first_key, key, nodes, trailers

    def _parsed_bytes(self, start, start_key, end, end_key):
        """Mark the bytes from start to end as parsed.

        Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
        1) as parsed.

        :param start: The start of the parsed region.
        :param end: The end of the parsed region.
        """
        index = self._parsed_byte_index(start)
        new_value = (start, end)
        new_key = (start_key, end_key)
        if index == -1:
            # first range parsed is always the beginning.
            self._parsed_byte_map.insert(index, new_value)
            self._parsed_key_map.insert(index, new_key)
            return
        # four cases:
        # new region
        # extend lower region
        # extend higher region
        # combine two regions
        if (index + 1 < len(self._parsed_byte_map) and
            self._parsed_byte_map[index][1] == start and
            self._parsed_byte_map[index + 1][0] == end):
            # combine two regions
            self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
                self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index] = (self._parsed_key_map[index][0],
                self._parsed_key_map[index + 1][1])
            del self._parsed_byte_map[index + 1]
            del self._parsed_key_map[index + 1]
        elif self._parsed_byte_map[index][1] == start:
            # extend the lower entry
            self._parsed_byte_map[index] = (
                self._parsed_byte_map[index][0], end)
            self._parsed_key_map[index] = (
                self._parsed_key_map[index][0], end_key)
        elif (index + 1 < len(self._parsed_byte_map) and
            self._parsed_byte_map[index + 1][0] == end):
            # extend the higher entry
            self._parsed_byte_map[index + 1] = (
                start, self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index + 1] = (
                start_key, self._parsed_key_map[index + 1][1])
        else:
            # new entry
            self._parsed_byte_map.insert(index + 1, new_value)
            self._parsed_key_map.insert(index + 1, new_key)

    def _read_and_parse(self, readv_ranges):
        """Read the ranges and parse the resulting data.

        :param readv_ranges: A prepared readv range list.
        """
        if not readv_ranges:
            return
        if self._nodes is None and self._bytes_read * 2 >= self._size:
            # We've already read more than 50% of the file and we are about to
            # request more data, just _buffer_all() and be done
            self._buffer_all()
            return

        readv_data = self._transport.readv(self._name, readv_ranges, True,
            self._size)
        # parse
        for offset, data in readv_data:
            self._bytes_read += len(data)
            if offset == 0 and len(data) == self._size:
                # We read the whole range, most likely because the
                # Transport upcast our readv ranges into one long request
                # for enough total data to grab the whole index.
                self._buffer_all(StringIO(data))
                return
            if self._bisect_nodes is None:
                # this must be the start
                if not (offset == 0):
                    raise AssertionError()
                offset, data = self._parse_header_from_bytes(data)
            # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
            self._parse_region(offset, data)

    def _signature(self):
        """The file signature for this index type."""
        return _SIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # iter_all validates completely at the moment, so just do that.
        for node in self.iter_all_entries():
            pass


class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    The backing indices must implement GraphIndex, and are presumed to be
    static data.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of indices can thus influence
    performance significantly. For example, if one index is on local disk and a
    second on a remote server, the local disk index should be before the other
    in the index list.
    """
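
    # Illustrative usage: stack a fast local index in front of a slower
    # remote one; queries are answered by the first index that has the key.
    #
    #   combined = CombinedGraphIndex([local_index, remote_index])
    #   nodes = list(combined.iter_entries([('a-key',)]))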

    def __init__(self, indices, reload_func=None):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        :param reload_func: A function to call if we find we are missing an
            index. Should have the form reload_func() => True/False to indicate
            if reloading actually changed anything.
        """
        self._indices = indices
        self._reload_func = reload_func

    def __repr__(self):
        return "%s(%s)" % (
                self.__class__.__name__,
                ', '.join(map(repr, self._indices)))

    def get_parent_map(self, keys):
        """See graph.StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if NULL_REVISION in search_keys:
            search_keys.discard(NULL_REVISION)
            found_parents = {NULL_REVISION:[]}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            parents = refs[0]
            if not parents:
                parents = (NULL_REVISION,)
            found_parents[key] = parents
        return found_parents

    has_key = _has_key_from_parent_map

    def insert_index(self, pos, index):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        """
        self._indices.insert(pos, index)

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :return: An iterable of (index, key, value, reference_lists).
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        seen_keys = set()
        while True:
            try:
                for index in self._indices:
                    for node in index.iter_all_entries():
                        if node[1] not in seen_keys:
                            yield node
                            seen_keys.add(node[1])
                return
            except errors.NoSuchFile:
                self._reload_or_raise()

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index.
        """
        keys = set(keys)
        while True:
            try:
                for index in self._indices:
                    if not keys:
                        return
                    for node in index.iter_entries(keys):
                        keys.remove(node[1])
                        yield node
                return
            except errors.NoSuchFile:
                self._reload_or_raise()

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        seen_keys = set()
        while True:
            try:
                for index in self._indices:
                    for node in index.iter_entries_prefix(keys):
                        if node[1] in seen_keys:
                            continue
                        seen_keys.add(node[1])
                        yield node
                return
            except errors.NoSuchFile:
                self._reload_or_raise()

    def find_ancestry(self, keys, ref_list_num):
        """Find the complete ancestry for the given set of keys.

        Note that this is a whole-ancestry request, so it should be used
        sparingly.

        :param keys: An iterable of keys to look for
        :param ref_list_num: The reference list which references the parents
            we care about.
        :return: (parent_map, missing_keys)
        """
        missing_keys = set()
        parent_map = {}
        keys_to_lookup = set(keys)
        generation = 0
        while keys_to_lookup:
            # keys that *all* indexes claim are missing, stop searching them
            generation += 1
            all_index_missing = None
            # print 'gen\tidx\tsub\tn_keys\tn_pmap\tn_miss'
            # print '%4d\t\t\t%4d\t%5d\t%5d' % (generation, len(keys_to_lookup),
            #                                   len(parent_map),
            #                                   len(missing_keys))
            for index_idx, index in enumerate(self._indices):
                # TODO: we should probably be doing something with
                #       'missing_keys' since we've already determined that
                #       those revisions have not been found anywhere
                index_missing_keys = set()
                # Find all of the ancestry we can from this index
                # keep looking until the search_keys set is empty, which means
                # things we didn't find should be in index_missing_keys
                search_keys = keys_to_lookup
                sub_generation = 0
                # print '    \t%2d\t\t%4d\t%5d\t%5d' % (
                #     index_idx, len(search_keys),
                #     len(parent_map), len(index_missing_keys))
                while search_keys:
                    sub_generation += 1
                    # TODO: ref_list_num should really be a parameter, since
                    #       CombinedGraphIndex does not know what the ref lists
                    #       mean.
                    search_keys = index._find_ancestors(search_keys,
                        ref_list_num, parent_map, index_missing_keys)
                    # print '    \t  \t%2d\t%4d\t%5d\t%5d' % (
                    #     sub_generation, len(search_keys),
                    #     len(parent_map), len(index_missing_keys))
                # Now set whatever was missing to be searched in the next index
                keys_to_lookup = index_missing_keys
                if all_index_missing is None:
                    all_index_missing = set(index_missing_keys)
                else:
                    all_index_missing.intersection_update(index_missing_keys)
                if not keys_to_lookup:
                    break
            if all_index_missing is None:
                # There were no indexes, so all search keys are 'missing'
                missing_keys.update(keys_to_lookup)
                keys_to_lookup = None
            else:
                missing_keys.update(all_index_missing)
                keys_to_lookup.difference_update(all_index_missing)
        return parent_map, missing_keys

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For CombinedGraphIndex this is approximated by the sum of the keys of
        the child indices. As child indices may have duplicate keys this can
        have a maximum error of the number of child indices * largest number of
        duplicated keys.
        """
        while True:
            try:
                return sum((index.key_count() for index in self._indices), 0)
            except errors.NoSuchFile:
                self._reload_or_raise()

    missing_keys = _missing_keys_from_parent_map

    def _reload_or_raise(self):
        """We just got a NoSuchFile exception.

        Try to reload the indices, if it fails, just raise the current
        exception.
        """
        if self._reload_func is None:
            raise
        exc_type, exc_value, exc_traceback = sys.exc_info()
        trace.mutter('Trying to reload after getting exception: %s',
                     exc_value)
        if not self._reload_func():
            # We tried to reload, but nothing changed, so we fail anyway
            trace.mutter('_reload_func indicated nothing has changed.'
                         ' Raising original exception.')
            raise exc_type, exc_value, exc_traceback

    def validate(self):
        """Validate that everything in the index can be accessed."""
        while True:
            try:
                for index in self._indices:
                    index.validate()
                return
            except errors.NoSuchFile:
                self._reload_or_raise()


class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.

    This is designed to allow the accumulation of GraphIndex entries during a
    single write operation, where the accumulated entries need to be immediately
    available - for example via a CombinedGraphIndex.
    """
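
    # Illustrative usage: accumulate entries in memory and expose them next
    # to an on-disk index before anything has been written out.
    #
    #   fresh = InMemoryGraphIndex(reference_lists=1)
    #   fresh.add_nodes([(('key',), 'value', ([],))])
    #   combined = CombinedGraphIndex([fresh, existing_index])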

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        if self.reference_lists:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2], node[1]
        else:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        # XXX: Too much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                node = self._nodes[key]
                if node[0]:
                    continue
                if self.reference_lists:
                    yield self, key, node[2], node[1]
                else:
                    yield self, key, node[2]
            return
        nodes_by_key = self._get_nodes_by_key()
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict to return
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push dict down onto the stack
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + value
            else:
                yield (self, ) + key_dict

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._keys)

    def validate(self):
        """In memory indexes have no known corruption at the moment."""


class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.

    Queries against this will emit queries against the adapted Graph with the
    prefix added, queries for all items use iter_entries_prefix. The returned
    nodes will have their keys and node references adjusted to remove the
    prefix. Finally, an add_nodes_callback can be supplied - when called the
    nodes and references being added will have prefix prepended.
    """
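
    # Illustrative usage: expose the ('prefix',) sub-space of a two-element
    # key index as if it were a one-element key index.
    #
    #   adapter = GraphIndexPrefixAdapter(wide_index, ('prefix',), 1)
    #   for node in adapter.iter_all_entries():
    #       pass  # node keys have had ('prefix',) stripped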

    def __init__(self, adapted, prefix, missing_key_length,
        add_nodes_callback=None):
        """Construct an adapter against adapted with prefix."""
        self.adapted = adapted
        self.prefix_key = prefix + (None,)*missing_key_length
        self.prefix = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, value, node_refs) entries to add.
        """
        # save nodes in case it's an iterator
        nodes = tuple(nodes)
        translated_nodes = []
        try:
            # Add prefix_key to each reference; node_refs is a tuple of tuples,
            # so split it apart, and add prefix_key to the internal reference
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
                        for ref_list in node_refs))
                translated_nodes.append((self.prefix + key, value,
                    adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
            # explicitly.
            for (key, value) in nodes:
                translated_nodes.append((self.prefix + key, value))
        self.add_nodes_callback(translated_nodes)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        self.add_nodes(((key, value, references), ))

    def _strip_prefix(self, an_iter):
        """Strip prefix data from nodes and return it."""
        for node in an_iter:
            # cross checks
            if node[1][:self.prefix_len] != self.prefix:
                raise errors.BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise errors.BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                for ref_list in node[3]))
1655
def iter_all_entries(self):
1656
"""Iterate over all keys within the index
1658
iter_all_entries is implemented against the adapted index using
1659
iter_entries_prefix.
1661
:return: An iterable of (index, key, reference_lists, value). There is no
1662
defined order for the result iteration - it will be in the most
1663
efficient order for the index (in this case dictionary hash order).
1665
return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        return self._strip_prefix(self.adapted.iter_entries(
            self.prefix + key for key in keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix(
            self.prefix + key for key in keys))

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndexPrefixAdapter this is relatively expensive - key
        iteration with the prefix is done.
        """
        return len(list(self.iter_all_entries()))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()