# Copyright (C) 2007, 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

"""Indexing facilities."""

__all__ = [
    'CombinedGraphIndex',
    'GraphIndex',
    'GraphIndexBuilder',
    'GraphIndexPrefixAdapter',
    'InMemoryGraphIndex',
    ]

from bisect import bisect_right
from cStringIO import StringIO
import re
import sys

from bzrlib.lazy_import import lazy_import
lazy_import(globals(), """
from bzrlib import trace
from bzrlib.bisect_multi import bisect_multi_bytes
from bzrlib.revision import NULL_REVISION
from bzrlib.trace import mutter
""")
from bzrlib import (
    debug,
    errors,
    )
from bzrlib._static_tuple_c import StaticTuple

_HEADER_READV = (0, 200)
_OPTION_KEY_ELEMENTS = "key_elements="
_OPTION_LEN = "len="
_OPTION_NODE_REFS = "node_ref_lists="
_SIGNATURE = "Bazaar Graph Index 1\n"

_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
_newline_null_re = re.compile('[\n\0]')


def _has_key_from_parent_map(self, key):
    """Check if this index has one key.

    If it's possible to check for multiple keys at once through
    calling get_parent_map that should be faster.
    """
    return (key in self.get_parent_map([key]))


def _missing_keys_from_parent_map(self, keys):
    return set(keys) - set(self.get_parent_map(keys))


class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure:

    _SIGNATURE OPTIONS NODES NEWLINE
    _SIGNATURE        := 'Bazaar Graph Index 1' NEWLINE
    OPTIONS           := 'node_ref_lists=' DIGITS NEWLINE
    NODES             := NODE*
    NODE              := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
    KEY               := Not-whitespace-utf8
    ABSENT            := 'a'
    REFERENCES        := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
    REFERENCE_LIST    := (REFERENCE (CR REFERENCE)*)?
    REFERENCE         := DIGITS  ; digits is the byte offset in the index of the
                                 ; referenced key.
    VALUE             := no-newline-no-null-bytes
    """
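
    # Illustrative sketch (not from the original source): with
    # node_ref_lists=1 and key_elements=1, a two-node index would serialize
    # along these lines, where NN stands for the zero-padded decimal byte
    # offset of the referenced 'parent' record and a final empty line
    # terminates the file:
    #
    #   Bazaar Graph Index 1
    #   node_ref_lists=1
    #   key_elements=1
    #   len=2
    #   child\x00\x00NN\x00child-value
    #   parent\x00\x00\x00parent-value
    #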

    def __init__(self, reference_lists=0, key_elements=1):
        """Create a GraphIndex builder.

        :param reference_lists: The number of node references lists for each
            entry.
        :param key_elements: The number of bytestrings in each key.
        """
        self.reference_lists = reference_lists
        self._keys = set()
        # A dict of {key: (absent, ref_lists, value)}
        self._nodes = {}
        self._nodes_by_key = None
        self._key_length = key_elements
        self._optimize_for_size = False
        self._combine_backing_indices = True

    def _check_key(self, key):
        """Raise BadIndexKey if key is not a valid key for this index."""
        if type(key) not in (tuple, StaticTuple):
            raise errors.BadIndexKey(key)
        if self._key_length != len(key):
            raise errors.BadIndexKey(key)
        for element in key:
            if not element or _whitespace_re.search(element) is not None:
                raise errors.BadIndexKey(element)

    def _external_references(self):
        """Return references that are not present in this index."""
        keys = set()
        refs = set()
        # TODO: JAM 2008-11-21 This makes an assumption about how the reference
        #       lists are used. It is currently correct for pack-0.92 through
        #       1.9, which use the node references (3rd column) second
        #       reference list as the compression parent. Perhaps this should
        #       be moved into something higher up the stack, since it
        #       makes assumptions about how the index is used.
        if self.reference_lists > 1:
            for node in self.iter_all_entries():
                keys.add(node[1])
                refs.update(node[3][1])
            return refs - keys
        else:
            # If reference_lists == 0 there can be no external references, and
            # if reference_lists == 1, then there isn't a place to store the
            # compression parent
            return set()

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.reference_lists:
                for key, (absent, references, value) in self._nodes.iteritems():
                    if absent:
                        continue
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, (absent, references, value) in self._nodes.iteritems():
                    if absent:
                        continue
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def _update_nodes_by_key(self, key, value, node_refs):
        """Update the _nodes_by_key dict with a new key.

        For a key of (foo, bar, baz) create
        _nodes_by_key[foo][bar][baz] = key_value
        """
        if self._nodes_by_key is None:
            return
        key_dict = self._nodes_by_key
        if self.reference_lists:
            key_value = key, value, node_refs
        else:
            key_value = key, value
        for subkey in key[:-1]:
            key_dict = key_dict.setdefault(subkey, {})
        key_dict[key[-1]] = key_value
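
    # Illustrative example (not from the original source): after adding the
    # key ('foo', 'bar', 'baz') with value 'v' and no reference lists,
    # _nodes_by_key nests as
    #   {'foo': {'bar': {'baz': (('foo', 'bar', 'baz'), 'v')}}}
    # with node_refs appended to the leaf tuple when reference lists are used.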

    def _check_key_ref_value(self, key, references, value):
        """Check that 'key' and 'references' are all valid.

        :param key: A key tuple. Must conform to the key interface (be a tuple,
            be of the right length, not have any whitespace or nulls in any key
            element.)
        :param references: An iterable of reference lists. Something like
            [[(ref, key)], [(ref, key), (other, key)]]
        :param value: The value associated with this key. Must not contain
            newlines or null characters.
        :return: (node_refs, absent_references)
            node_refs           basically a packed form of 'references' where all
                                iterables are tuples
            absent_references   reference keys that are not in self._nodes.
                                This may contain duplicates if the same key is
                                referenced in multiple lists.
        """
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise errors.BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise errors.BadIndexValue(references)
        node_refs = []
        absent_references = []
        for reference_list in references:
            for reference in reference_list:
                # If reference *is* in self._nodes, then we know it has already
                # been checked.
                if reference not in self._nodes:
                    self._check_key(reference)
                    absent_references.append(reference)
            node_refs.append(tuple(reference_list))
        return tuple(node_refs), absent_references

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        (node_refs,
         absent_references) = self._check_key_ref_value(key, references, value)
        if key in self._nodes and self._nodes[key][0] != 'a':
            raise errors.BadIndexDuplicateKey(key, self)
        for reference in absent_references:
            # There may be duplicates, but I don't think it is worth worrying
            # about
            self._nodes[reference] = ('a', (), '')
        self._nodes[key] = ('', node_refs, value)
        self._keys.add(key)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)

    def finish(self):
        lines = [_SIGNATURE]
        lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
        lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
        lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
        prefix_length = sum(len(x) for x in lines)
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the
        # file cannot be determined until all the inbetween references have
        # been calculated too) we pad the offsets with 0's to make them be
        # of consistent length. Using binary offsets would break the trivial
        # file parsing.
        # to calculate the width of zero's needed we do three passes:
        # one to gather all the non-reference data and the number of references.
        # one to pad all the data with reference-length and determine entry
        # addresses.
        # One to serialise.

        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        # direct lookup
        nodes = sorted(self._nodes.items())
        # if we do not prepass, we don't know how long it will be up front.
        expected_bytes = None
        # we only need to pre-pass if we have reference lists at all.
        if self.reference_lists:
            key_offset_info = []
            non_ref_bytes = prefix_length
            total_references = 0
            # TODO use simple multiplication for the constants in this loop.
            for key, (absent, references, value) in nodes:
                # record the offset known *so far* for this key:
                # the non reference bytes to date, and the total references to
                # date - saves reaccumulating on the second pass
                key_offset_info.append((key, non_ref_bytes, total_references))
                # key is literal, value is literal, there are 3 null's, 1 NL
                # key is variable length tuple, \x00 between elements
                non_ref_bytes += sum(len(element) for element in key)
                if self._key_length > 1:
                    non_ref_bytes += self._key_length - 1
                # value is literal bytes, there are 3 null's, 1 NL.
                non_ref_bytes += len(value) + 3 + 1
                # one byte for absent if set.
                if absent:
                    non_ref_bytes += 1
                elif self.reference_lists:
                    # (ref_lists -1) tabs
                    non_ref_bytes += self.reference_lists - 1
                    # (ref-1 cr's per ref_list)
                    for ref_list in references:
                        # how many references across the whole file?
                        total_references += len(ref_list)
                        # accrue reference separators
                        if ref_list:
                            non_ref_bytes += len(ref_list) - 1
            # how many digits are needed to represent the total byte count?
            digits = 1
            possible_total_bytes = non_ref_bytes + total_references*digits
            while 10 ** digits < possible_total_bytes:
                digits += 1
                possible_total_bytes = non_ref_bytes + total_references*digits
            expected_bytes = possible_total_bytes + 1 # terminating newline
            # resolve key addresses.
            key_addresses = {}
            for key, non_ref_bytes, total_references in key_offset_info:
                key_addresses[key] = non_ref_bytes + total_references*digits
            # serialise
            format_string = '%%0%sd' % digits
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = []
                for reference in ref_list:
                    ref_addresses.append(format_string % key_addresses[reference])
                flattened_references.append('\r'.join(ref_addresses))
            string_key = '\x00'.join(key)
            lines.append("%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
                '\t'.join(flattened_references), value))
        lines.append('\n')
        result = StringIO(''.join(lines))
        if expected_bytes and len(result.getvalue()) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                ' mismatched output length and expected length: %d %d' %
                (len(result.getvalue()), expected_bytes))
        return result
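
    # A minimal usage sketch (illustrative, not from the original source):
    #
    #   builder = GraphIndexBuilder(reference_lists=1)
    #   builder.add_node(('parent',), 'parent-value', ([],))
    #   builder.add_node(('child',), 'child-value', ([('parent',)],))
    #   index_bytes = builder.finish().read()
    #
    # The digit-width loop above is a fixed point: for example, with 95
    # non-reference bytes and 2 references, digits=1 gives 97 possible bytes
    # (10**1 is too small), digits=2 gives 99, which 10**2 covers, so the
    # offsets are written as 2-digit zero-padded numbers.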

    def set_optimize(self, for_size=None, combine_backing_indices=None):
        """Change how the builder tries to optimize the result.

        :param for_size: Tell the builder to try and make the index as small as
            possible.
        :param combine_backing_indices: If the builder spills to disk to save
            memory, should the on-disk indices be combined. Set to True if you
            are going to be probing the index, but to False if you are not. (If
            you are not querying, then the time spent combining is wasted.)
        :return: None
        """
        # GraphIndexBuilder itself doesn't pay attention to the flag yet, but
        # child builders do.
        if for_size is not None:
            self._optimize_for_size = for_size
        if combine_backing_indices is not None:
            self._combine_backing_indices = combine_backing_indices

    def find_ancestry(self, keys, ref_list_num):
        """See CombinedGraphIndex.find_ancestry()"""
        pending = set(keys)
        parent_map = {}
        missing_keys = set()
        while pending:
            next_pending = set()
            for _, key, value, ref_lists in self.iter_entries(pending):
                parent_keys = ref_lists[ref_list_num]
                parent_map[key] = parent_keys
                next_pending.update([p for p in parent_keys if p not in
                                     parent_map])
            missing_keys.update(pending.difference(parent_map))
            pending = next_pending
        return parent_map, missing_keys


class GraphIndex(object):
    """An index for data with embedded graphs.

    The index maps keys to a list of key reference lists, and a value.
    Each node has the same number of key reference lists. Each key reference
    list can be empty or an arbitrary length. The value is an opaque NULL
    terminated string without any newlines. The storage of the index is
    hidden in the interface: keys and key references are always tuples of
    bytestrings, never the internal representation (e.g. dictionary offsets).

    It is presumed that the index will not be mutated - it is static data.

    Successive iter_all_entries calls will read the entire index each time.
    Additionally, iter_entries calls will read the index linearly until the
    desired keys are found. XXX: This must be fixed before the index is
    suitable for production use. :XXX
    """

    def __init__(self, transport, name, size):
        """Open an index called name on transport.

        :param transport: A bzrlib.transport.Transport.
        :param name: A path to provide to transport API calls.
        :param size: The size of the index in bytes. This is used for bisection
            logic to perform partial index reads. While the size could be
            obtained by statting the file this introduced an additional round
            trip as well as requiring stat'able transports, both of which are
            avoided by having it supplied. If size is None, then bisection
            support will be disabled and accessing the index will just stream
            all the data.
        """
        self._transport = transport
        self._name = name
        # Becomes a dict of key:(value, reference-list-byte-locations) used by
        # the bisection interface to store parsed but not resolved keys.
        self._bisect_nodes = None
        # Becomes a dict of key:(value, reference-list-keys) which are ready to
        # be returned directly to callers.
        self._nodes = None
        # a sorted list of slice-addresses for the parsed bytes of the file.
        # e.g. (0,1) would mean that byte 0 is parsed.
        self._parsed_byte_map = []
        # a sorted list of keys matching each slice address for parsed bytes
        # e.g. (None, 'foo@bar') would mean that the first byte contained no
        # key, and the end byte of the slice is the end of the data for 'foo@bar'
        self._parsed_key_map = []
        self._key_count = None
        self._keys_by_offset = None
        self._nodes_by_key = None
        self._size = size
        # The number of bytes we've read so far in trying to process this file
        self._bytes_read = 0

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            type(self) == type(other) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__,
            self._transport.abspath(self._name))

    def _buffer_all(self, stream=None):
        """Buffer all the index data.

        Mutates self._nodes and self.keys_by_offset.
        """
        if self._nodes is not None:
            # We already did this
            return
        if 'index' in debug.debug_flags:
            mutter('Reading entire index %s', self._transport.abspath(self._name))
        if stream is None:
            stream = self._transport.get(self._name)
        self._read_prefix(stream)
        self._expected_elements = 3 + self._key_length
        # raw data keyed by offset
        self._keys_by_offset = {}
        # ready-to-return key:value or key:value, node_ref_lists
        self._nodes = {}
        self._nodes_by_key = None
        trailers = 0
        pos = stream.tell()
        lines = stream.read().split('\n')
        del lines[-1]
        _, _, _, trailers = self._parse_lines(lines, pos)
        for key, absent, references, value in self._keys_by_offset.itervalues():
            if absent:
                continue
            # resolve references:
            if self.node_ref_lists:
                node_value = (value, self._resolve_references(references))
            else:
                node_value = value
            self._nodes[key] = node_value
        # cache the keys for quick set intersections
        self._keys = set(self._nodes)
        if trailers != 1:
            # there must be one line - the empty trailer line.
            raise errors.BadIndexData(self)

    def external_references(self, ref_list_num):
        """Return references that are not present in this index."""
        self._buffer_all()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                % (ref_list_num, self.node_ref_lists))
        refs = set()
        for key, (value, ref_lists) in self._nodes.iteritems():
            ref_list = ref_lists[ref_list_num]
            refs.update(ref_list)
        return refs - self._keys

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.node_ref_lists:
                for key, (value, references) in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, value in self._nodes.iteritems():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def iter_all_entries(self):
        """Iterate over all keys within the index.

        :return: An iterable of (index, key, value) or (index, key, value, reference_lists).
            The former tuple is used when there are no reference lists in the
            index, making the API compatible with simple key:value index types.
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self._nodes is None:
            self._buffer_all()
        if self.node_ref_lists:
            for key, (value, node_ref_lists) in self._nodes.iteritems():
                yield self, key, value, node_ref_lists
        else:
            for key, value in self._nodes.iteritems():
                yield self, key, value

    def _read_prefix(self, stream):
        signature = stream.read(len(self._signature()))
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_LEN):
            raise errors.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):-1])
        except ValueError:
            raise errors.BadIndexOptions(self)

    def _resolve_references(self, references):
        """Return the resolved key references for references.

        References are resolved by looking up the location of the key in the
        _keys_by_offset map and substituting the key name, preserving ordering.

        :param references: An iterable of iterables of key locations. e.g.
            [[123, 456], [123]]
        :return: A tuple of tuples of keys.
        """
        node_refs = []
        for ref_list in references:
            node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
        return tuple(node_refs)

    def _find_index(self, range_map, key):
        """Helper for the _parsed_*_index calls.

        Given a range map - [(start, end), ...], finds the index of the range
        in the map for key if it is in the map, and if it is not there, the
        immediately preceding range in the map.
        """
        result = bisect_right(range_map, key) - 1
        if result + 1 < len(range_map):
            # check the border condition, it may be in result + 1
            if range_map[result + 1][0] == key[0]:
                return result + 1
        return result

    def _parsed_byte_index(self, offset):
        """Return the index of the entry immediately before offset.

        e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
        there is one unparsed byte (the 11th, addressed as [10]). then:
        asking for 0 will return 0
        asking for 10 will return 0
        asking for 11 will return 1
        asking for 12 will return 1
        """
        key = (offset, 0)
        return self._find_index(self._parsed_byte_map, key)

    def _parsed_key_index(self, key):
        """Return the index of the entry immediately before key.

        e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
        meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
        have been parsed, then:
        asking for '' will return 0
        asking for 'a' will return 0
        asking for 'b' will return 1
        asking for 'e' will return 1
        """
        search_key = (key, None)
        return self._find_index(self._parsed_key_map, search_key)

    def _is_parsed(self, offset):
        """Returns True if offset has been parsed."""
        index = self._parsed_byte_index(offset)
        if index == len(self._parsed_byte_map):
            return offset < self._parsed_byte_map[index - 1][1]
        start, end = self._parsed_byte_map[index]
        return offset >= start and offset < end

    def _iter_entries_from_total_buffer(self, keys):
        """Iterate over keys when the entire index is parsed."""
        keys = keys.intersection(self._keys)
        if self.node_ref_lists:
            for key in keys:
                value, node_refs = self._nodes[key]
                yield self, key, value, node_refs
        else:
            for key in keys:
                yield self, key, self._nodes[key]

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys supplied. No additional keys will be returned, and every
            key supplied that is in the index will be returned.
        """
        keys = set(keys)
        if not keys:
            return []
        if self._size is None and self._nodes is None:
            self._buffer_all()

        # We fit about 20 keys per minimum-read (4K), so if we are looking for
        # more than 1/20th of the index it's likely (assuming homogenous key
        # spread) that we'll read the entire index. If we're going to do that,
        # buffer the whole thing. A better analysis might take key spread into
        # account - but B+Tree indices are better anyway.
        # We could look at all data read, and use a threshold there, which will
        # trigger on ancestry walks, but that is not yet fully mapped out.
        if self._nodes is None and len(keys) * 20 > self.key_count():
            self._buffer_all()
        if self._nodes is not None:
            return self._iter_entries_from_total_buffer(keys)
        else:
            return (result[1] for result in bisect_multi_bytes(
                self._lookup_keys_via_location, self._size, keys))
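
    # Rough numbers behind the heuristic above (illustrative, not from the
    # original source): at ~4K per minimum read and roughly 200 bytes per row,
    # each read returns about 20 rows, so asking for more than
    # key_count() / 20 keys tends to touch most of the file anyway, and one
    # up-front buffer of the whole index beats many bisection round trips.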

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        WARNING: Note that this method currently causes a full index parse
        unconditionally (which is reasonably appropriate as it is a means for
        thunking many small indices into one larger one and still supplies
        iter_all_entries at the thunk layer).

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        # load data - also finds key lengths
        if self._nodes is None:
            self._buffer_all()
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                if self.node_ref_lists:
                    value, node_refs = self._nodes[key]
                    yield self, key, value, node_refs
                else:
                    yield self, key, self._nodes[key]
            return
        nodes_by_key = self._get_nodes_by_key()
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict whose contents should be returned.
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements = elements[1:]
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            # each value is the key:value:node refs tuple
                            # ready to yield.
                            yield (self, ) + value
            else:
                # the last thing looked up was a terminal element
                yield (self, ) + key_dict

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """See BTreeIndex._find_ancestors."""
        # The api can be implemented as a trivial overlay on top of
        # iter_entries, it is not an efficient implementation, but it at least
        # gets the job done.
        found_keys = set()
        search_keys = set()
        for index, key, value, refs in self.iter_entries(keys):
            parent_keys = refs[ref_list_num]
            found_keys.add(key)
            parent_map[key] = parent_keys
            search_keys.update(parent_keys)
        # Figure out what, if anything, was missing
        missing_keys.update(set(keys).difference(found_keys))
        search_keys = search_keys.difference(parent_map)
        return search_keys

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndex the estimate is exact.
        """
        if self._key_count is None:
            self._read_and_parse([_HEADER_READV])
        return self._key_count

    def _lookup_keys_via_location(self, location_keys):
        """Public interface for implementing bisection.

        If _buffer_all has been called, then all the data for the index is in
        memory, and this method should not be called, as it uses a separate
        cache because it cannot pre-resolve all indices, which buffer_all does
        for performance.

        :param location_keys: A list of location(byte offset), key tuples.
        :return: A list of (location_key, result) tuples as expected by
            bzrlib.bisect_multi.bisect_multi_bytes.
        """
        # Possible improvements:
        #  - only bisect lookup each key once
        #  - sort the keys first, and use that to reduce the bisection window
        # -----
        # this progresses in three parts:
        # read data
        # parse it
        # attempt to answer the question from the now in memory data.
        # build the readv request
        # for each location, ask for 800 bytes - much more than rows we've seen
        # anywhere.
        readv_ranges = []
        for location, key in location_keys:
            # can we answer from cache?
            if self._bisect_nodes and key in self._bisect_nodes:
                # We have the key parsed.
                continue
            index = self._parsed_key_index(key)
            if (len(self._parsed_key_map) and
                self._parsed_key_map[index][0] <= key and
                (self._parsed_key_map[index][1] >= key or
                 # end of the file has been parsed
                 self._parsed_byte_map[index][1] == self._size)):
                # the key has been parsed, so no lookup is needed even if it's
                # not present.
                continue
            # - if we have examined this part of the file already - yes
            index = self._parsed_byte_index(location)
            if (len(self._parsed_byte_map) and
                self._parsed_byte_map[index][0] <= location and
                self._parsed_byte_map[index][1] > location):
                # the byte region has been parsed, so no read is needed.
                continue
            length = 800
            if location + length > self._size:
                length = self._size - location
            # todo, trim out parsed locations.
            if length > 0:
                readv_ranges.append((location, length))
        # read the header if needed
        if self._bisect_nodes is None:
            readv_ranges.append(_HEADER_READV)
        self._read_and_parse(readv_ranges)
        result = []
        if self._nodes is not None:
            # _read_and_parse triggered a _buffer_all because we requested the
            # whole data range
            for location, key in location_keys:
                if key not in self._nodes: # not present
                    result.append(((location, key), False))
                elif self.node_ref_lists:
                    value, refs = self._nodes[key]
                    result.append(((location, key),
                        (self, key, value, refs)))
                else:
                    result.append(((location, key),
                        (self, key, self._nodes[key])))
            return result
        # generate results:
        #  - figure out <, >, missing, present
        #  - result present references so we can return them.
        # keys that we cannot answer until we resolve references
        pending_references = []
        pending_locations = set()
        for location, key in location_keys:
            # can we answer from cache?
            if key in self._bisect_nodes:
                # the key has been parsed, so no lookup is needed
                if self.node_ref_lists:
                    # the references may not have been all parsed.
                    value, refs = self._bisect_nodes[key]
                    wanted_locations = []
                    for ref_list in refs:
                        for ref in ref_list:
                            if ref not in self._keys_by_offset:
                                wanted_locations.append(ref)
                    if wanted_locations:
                        pending_locations.update(wanted_locations)
                        pending_references.append((location, key))
                        continue
                    result.append(((location, key), (self, key,
                        value, self._resolve_references(refs))))
                else:
                    result.append(((location, key),
                        (self, key, self._bisect_nodes[key])))
                continue
            else:
                # has the region the key should be in, been parsed?
                index = self._parsed_key_index(key)
                if (self._parsed_key_map[index][0] <= key and
                    (self._parsed_key_map[index][1] >= key or
                     # end of the file has been parsed
                     self._parsed_byte_map[index][1] == self._size)):
                    result.append(((location, key), False))
                    continue
            # no, is the key above or below the probed location:
            # get the range of the probed & parsed location
            index = self._parsed_byte_index(location)
            # if the key is below the start of the range, its below
            if key < self._parsed_key_map[index][0]:
                direction = -1
            else:
                direction = +1
            result.append(((location, key), direction))
        readv_ranges = []
        # lookup data to resolve references
        for location in pending_locations:
            length = 800
            if location + length > self._size:
                length = self._size - location
            # TODO: trim out parsed locations (e.g. if the 800 is into the
            # parsed region trim it, and don't use the adjust_for_latency
            # facility)
            if length > 0:
                readv_ranges.append((location, length))
        self._read_and_parse(readv_ranges)
        if self._nodes is not None:
            # The _read_and_parse triggered a _buffer_all, grab the data and
            # return it
            for location, key in pending_references:
                value, refs = self._nodes[key]
                result.append(((location, key), (self, key, value, refs)))
            return result
        for location, key in pending_references:
            # answer key references we had to look-up-late.
            value, refs = self._bisect_nodes[key]
            result.append(((location, key), (self, key,
                value, self._resolve_references(refs))))
        return result
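
    # Sketch of the bisection contract (illustrative, not from the original
    # source): bzrlib.bisect_multi.bisect_multi_bytes repeatedly guesses byte
    # offsets for each outstanding key and calls back into this method:
    #
    #   for location_key, answer in index._lookup_keys_via_location(
    #           [(5000, ('some-key',))]):
    #       # answer is False (a parsed region proves absence), a hit tuple
    #       # (index, key, value[, refs]), or -1/+1 telling the bisector to
    #       # probe lower or higher.
    #
    # Each call also parses the regions it had to read, so later probes can
    # answer more questions straight from cache.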

    def _parse_header_from_bytes(self, bytes):
        """Parse the header from a region of bytes.

        :param bytes: The data to parse.
        :return: An offset, data tuple such as readv yields, for the unparsed
            data. (which may be of length 0).
        """
        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise errors.BadIndexFormatSignature(self._name, GraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise errors.BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise errors.BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise errors.BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise errors.BadIndexOptions(self)
        # calculate the bytes we have processed
        header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
            len(lines[2]) + 3)
        self._parsed_bytes(0, None, header_end, None)
        # setup parsing state
        self._expected_elements = 3 + self._key_length
        # raw data keyed by offset
        self._keys_by_offset = {}
        # keys with the value and node references
        self._bisect_nodes = {}
        return header_end, bytes[header_end:]

    def _parse_region(self, offset, data):
        """Parse node data returned from a readv operation.

        :param offset: The byte offset the data starts at.
        :param data: The data to parse.
        """
        # trim the data.
        # end first:
        end = offset + len(data)
        high_parsed = offset
        while True:
            # Trivial test - if the current index's end is within the
            # low-matching parsed range, we're done.
            index = self._parsed_byte_index(high_parsed)
            if end < self._parsed_byte_map[index][1]:
                return
            # print "[%d:%d]" % (offset, end), \
            #     self._parsed_byte_map[index:index + 2]
            high_parsed, last_segment = self._parse_segment(
                offset, data, end, index)
            if last_segment:
                return

    def _parse_segment(self, offset, data, end, index):
        """Parse one segment of data.

        :param offset: Where 'data' begins in the file.
        :param data: Some data to parse a segment of.
        :param end: Where data ends
        :param index: The current index into the parsed bytes map.
        :return: True if the parsed segment is the last possible one in the
            range of data.
        :return: high_parsed_byte, last_segment.
            high_parsed_byte is the location of the highest parsed byte in this
            segment, last_segment is True if the parsed segment is the last
            possible one in the data block.
        """
        # default is to use all data
        trim_end = None
        # accommodate overlap with data before this.
        if offset < self._parsed_byte_map[index][1]:
            # overlaps the lower parsed region
            # skip the parsed data
            trim_start = self._parsed_byte_map[index][1] - offset
            # don't trim the start for \n
            start_adjacent = True
        elif offset == self._parsed_byte_map[index][1]:
            # abuts the lower parsed region
            # use all data
            trim_start = None
            # do not trim anything
            start_adjacent = True
        else:
            # does not overlap the lower parsed region
            # use all data
            trim_start = None
            # but trim the leading \n
            start_adjacent = False
        if end == self._size:
            # lines up to the end of all data:
            # use it all
            trim_end = None
            # do not strip to the last \n
            end_adjacent = True
            last_segment = True
        elif index + 1 == len(self._parsed_byte_map):
            # at the end of the parsed data
            # use it all
            trim_end = None
            # but strip to the last \n
            end_adjacent = False
            last_segment = True
        elif end == self._parsed_byte_map[index + 1][0]:
            # buts up against the next parsed region
            # use it all
            trim_end = None
            # do not strip to the last \n
            end_adjacent = True
            last_segment = True
        elif end > self._parsed_byte_map[index + 1][0]:
            # overlaps into the next parsed region
            # only consider the unparsed data
            trim_end = self._parsed_byte_map[index + 1][0] - offset
            # do not strip to the last \n as we know it's an entire record
            end_adjacent = True
            last_segment = end < self._parsed_byte_map[index + 1][1]
        else:
            # does not overlap into the next region
            # use it all
            trim_end = None
            # but strip to the last \n
            end_adjacent = False
            last_segment = True
        # now find bytes to discard if needed
        if not start_adjacent:
            # work around python bug in rfind
            if trim_start is None:
                trim_start = data.find('\n') + 1
            else:
                trim_start = data.find('\n', trim_start) + 1
            if not (trim_start != 0):
                raise AssertionError('no \n was present')
            # print 'removing start', offset, trim_start, repr(data[:trim_start])
        if not end_adjacent:
            # work around python bug in rfind
            if trim_end is None:
                trim_end = data.rfind('\n') + 1
            else:
                trim_end = data.rfind('\n', None, trim_end) + 1
            if not (trim_end != 0):
                raise AssertionError('no \n was present')
            # print 'removing end', offset, trim_end, repr(data[trim_end:])
        # adjust offset and data to the parseable data.
        trimmed_data = data[trim_start:trim_end]
        if not (trimmed_data):
            raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
                % (trim_start, trim_end, offset, offset + len(data)))
        if trim_start:
            offset += trim_start
        # print "parsing", repr(trimmed_data)
        # splitlines mangles the \r delimiters.. don't use it.
        lines = trimmed_data.split('\n')
        del lines[-1]
        pos = offset
        first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
        for key, value in nodes:
            self._bisect_nodes[key] = value
        self._parsed_bytes(offset, first_key,
            offset + len(trimmed_data), last_key)
        return offset + len(trimmed_data), last_segment
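
    # Illustrative example of the trimming above (not from the original
    # source): if an 800-byte read lands mid-record at both ends and touches
    # no parsed region, the bytes before the first '\n' and after the last
    # '\n' are discarded, since partial rows cannot be parsed; the
    # adjacent-region checks exist so that complete records abutting
    # already-parsed data are not thrown away.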

    def _parse_lines(self, lines, pos):
        key = None
        first_key = None
        trailers = 0
        nodes = []
        for line in lines:
            if line == '':
                # must be at the end
                if self._size:
                    if not (self._size == pos + 1):
                        raise AssertionError("%s %s" % (self._size, pos))
                trailers += 1
                continue
            elements = line.split('\0')
            if len(elements) != self._expected_elements:
                raise errors.BadIndexData(self)
            # keys are tuples. Each element is a string that may occur many
            # times, so we intern them to save space. AB, RC, 200807
            key = tuple([intern(element) for element in elements[:self._key_length]])
            if first_key is None:
                first_key = key
            absent, references, value = elements[-3:]
            ref_lists = []
            for ref_string in references.split('\t'):
                ref_lists.append(tuple([
                    int(ref) for ref in ref_string.split('\r') if ref
                    ]))
            ref_lists = tuple(ref_lists)
            self._keys_by_offset[pos] = (key, absent, ref_lists, value)
            pos += len(line) + 1 # +1 for the \n
            if absent:
                continue
            if self.node_ref_lists:
                node_value = (value, ref_lists)
            else:
                node_value = value
            nodes.append((key, node_value))
            # print "parsed ", key
        return first_key, key, nodes, trailers

    def _parsed_bytes(self, start, start_key, end, end_key):
        """Mark the bytes from start to end as parsed.

        Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
        1) as parsed.

        :param start: The start of the parsed region.
        :param end: The end of the parsed region.
        """
        index = self._parsed_byte_index(start)
        new_value = (start, end)
        new_key = (start_key, end_key)
        if index == -1:
            # first range parsed is always the beginning.
            self._parsed_byte_map.insert(index, new_value)
            self._parsed_key_map.insert(index, new_key)
            return
        # four cases:
        # new region
        # extend lower region
        # extend higher region
        # combine two regions
        if (index + 1 < len(self._parsed_byte_map) and
            self._parsed_byte_map[index][1] == start and
            self._parsed_byte_map[index + 1][0] == end):
            # combine two regions
            self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
                self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index] = (self._parsed_key_map[index][0],
                self._parsed_key_map[index + 1][1])
            del self._parsed_byte_map[index + 1]
            del self._parsed_key_map[index + 1]
        elif self._parsed_byte_map[index][1] == start:
            # extend the lower entry
            self._parsed_byte_map[index] = (
                self._parsed_byte_map[index][0], end)
            self._parsed_key_map[index] = (
                self._parsed_key_map[index][0], end_key)
        elif (index + 1 < len(self._parsed_byte_map) and
            self._parsed_byte_map[index + 1][0] == end):
            # extend the higher entry
            self._parsed_byte_map[index + 1] = (
                start, self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index + 1] = (
                start_key, self._parsed_key_map[index + 1][1])
        else:
            # new entry
            self._parsed_byte_map.insert(index + 1, new_value)
            self._parsed_key_map.insert(index + 1, new_key)
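
    # Worked example (illustrative, not from the original source): with
    # _parsed_byte_map == [(0, 100), (200, 300)], marking (100, 200) as parsed
    # takes the "combine two regions" branch, collapsing the map to
    # [(0, 300)]; marking (300, 400) instead extends the lower entry, giving
    # [(0, 100), (200, 400)].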

    def _read_and_parse(self, readv_ranges):
        """Read the ranges and parse the resulting data.

        :param readv_ranges: A prepared readv range list.
        """
        if not readv_ranges:
            return
        if self._nodes is None and self._bytes_read * 2 >= self._size:
            # We've already read more than 50% of the file and we are about to
            # request more data, just _buffer_all() and be done
            self._buffer_all()
            return

        readv_data = self._transport.readv(self._name, readv_ranges, True,
            self._size)
        # parse
        for offset, data in readv_data:
            self._bytes_read += len(data)
            if offset == 0 and len(data) == self._size:
                # We read the whole range, most likely because the
                # Transport upcast our readv ranges into one long request
                # for enough total data to grab the whole index.
                self._buffer_all(StringIO(data))
                return
            if self._bisect_nodes is None:
                # this must be the start
                if not (offset == 0):
                    raise AssertionError()
                offset, data = self._parse_header_from_bytes(data)
            # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
            self._parse_region(offset, data)

    def _signature(self):
        """The file signature for this index type."""
        return _SIGNATURE

    def validate(self):
        """Validate that everything in the index can be accessed."""
        # iter_all validates completely at the moment, so just do that.
        for node in self.iter_all_entries():
            pass


class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    The backing indices must implement GraphIndex, and are presumed to be
    static data.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of indices can thus influence
    performance significantly. For example, if one index is on local disk and a
    second on a remote server, the local disk index should be before the other
    in the index list.
    """

    def __init__(self, indices, reload_func=None):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        :param reload_func: A function to call if we find we are missing an
            index. Should have the form reload_func() => True/False to indicate
            if reloading actually changed anything.
        """
        self._indices = indices
        self._reload_func = reload_func

    def __repr__(self):
        return "%s(%s)" % (
                self.__class__.__name__,
                ', '.join(map(repr, self._indices)))

    def get_parent_map(self, keys):
        """See graph.StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if NULL_REVISION in search_keys:
            search_keys.discard(NULL_REVISION)
            found_parents = {NULL_REVISION:[]}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            parents = refs[0]
            if not parents:
                parents = (NULL_REVISION,)
            found_parents[key] = parents
        return found_parents

    has_key = _has_key_from_parent_map

    def insert_index(self, pos, index):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        """
        self._indices.insert(pos, index)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :return: An iterable of (index, key, reference_lists, value).
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        seen_keys = set()
        while True:
            try:
                for index in self._indices:
                    for node in index.iter_all_entries():
                        if node[1] not in seen_keys:
                            yield node
                            seen_keys.add(node[1])
                return
            except errors.NoSuchFile:
                self._reload_or_raise()

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index.
        """
        keys = set(keys)
        while True:
            try:
                for index in self._indices:
                    if not keys:
                        return
                    for node in index.iter_entries(keys):
                        keys.remove(node[1])
                        yield node
                return
            except errors.NoSuchFile:
                self._reload_or_raise()

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        seen_keys = set()
        while True:
            try:
                for index in self._indices:
                    for node in index.iter_entries_prefix(keys):
                        if node[1] in seen_keys:
                            continue
                        seen_keys.add(node[1])
                        yield node
                return
            except errors.NoSuchFile:
                self._reload_or_raise()

    def find_ancestry(self, keys, ref_list_num):
        """Find the complete ancestry for the given set of keys.

        Note that this is a whole-ancestry request, so it should be used
        sparingly.

        :param keys: An iterable of keys to look for
        :param ref_list_num: The reference list which references the parents
            we care about.
        :return: (parent_map, missing_keys)
        """
        missing_keys = set()
        parent_map = {}
        keys_to_lookup = set(keys)
        generation = 0
        while keys_to_lookup:
            # keys that *all* indexes claim are missing, stop searching them
            generation += 1
            all_index_missing = None
            # print 'gen\tidx\tsub\tn_keys\tn_pmap\tn_miss'
            # print '%4d\t\t\t%4d\t%5d\t%5d' % (generation, len(keys_to_lookup),
            #                                   len(parent_map),
            #                                   len(missing_keys))
            for index_idx, index in enumerate(self._indices):
                # TODO: we should probably be doing something with
                #       'missing_keys' since we've already determined that
                #       those revisions have not been found anywhere
                index_missing_keys = set()
                # Find all of the ancestry we can from this index
                # keep looking until the search_keys set is empty, which means
                # things we didn't find should be in index_missing_keys
                search_keys = keys_to_lookup
                sub_generation = 0
                # print '    \t%2d\t\t%4d\t%5d\t%5d' % (
                #     index_idx, len(search_keys),
                #     len(parent_map), len(index_missing_keys))
                while search_keys:
                    sub_generation += 1
                    # TODO: ref_list_num should really be a parameter, since
                    #       CombinedGraphIndex does not know what the ref lists
                    #       mean.
                    search_keys = index._find_ancestors(search_keys,
                        ref_list_num, parent_map, index_missing_keys)
                    # print '    \t  \t%2d\t%4d\t%5d\t%5d' % (
                    #     sub_generation, len(search_keys),
                    #     len(parent_map), len(index_missing_keys))
                # Now set whatever was missing to be searched in the next index
                keys_to_lookup = index_missing_keys
                if all_index_missing is None:
                    all_index_missing = set(index_missing_keys)
                else:
                    all_index_missing.intersection_update(index_missing_keys)
                if not keys_to_lookup:
                    break
            if all_index_missing is None:
                # There were no indexes, so all search keys are 'missing'
                missing_keys.update(keys_to_lookup)
                keys_to_lookup = None
            else:
                missing_keys.update(all_index_missing)
                keys_to_lookup.difference_update(all_index_missing)
        return parent_map, missing_keys

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For CombinedGraphIndex this is approximated by the sum of the keys of
        the child indices. As child indices may have duplicate keys this can
        have a maximum error of the number of child indices * largest number of
        duplicated keys.
        """
        while True:
            try:
                return sum((index.key_count() for index in self._indices), 0)
            except errors.NoSuchFile:
                self._reload_or_raise()

    missing_keys = _missing_keys_from_parent_map

    def _reload_or_raise(self):
        """We just got a NoSuchFile exception.

        Try to reload the indices, if it fails, just raise the current
        exception.
        """
        if self._reload_func is None:
            raise
        exc_type, exc_value, exc_traceback = sys.exc_info()
        trace.mutter('Trying to reload after getting exception: %s',
                     exc_value)
        if not self._reload_func():
            # We tried to reload, but nothing changed, so we fail anyway
            trace.mutter('_reload_func indicated nothing has changed.'
                         ' Raising original exception.')
            raise exc_type, exc_value, exc_traceback

    def validate(self):
        """Validate that everything in the index can be accessed."""
        while True:
            try:
                for index in self._indices:
                    index.validate()
                return
            except errors.NoSuchFile:
                self._reload_or_raise()


class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.

    This is designed to allow the accumulation of GraphIndex entries during a
    single write operation, where the accumulated entries need to be immediately
    available - for example via a CombinedGraphIndex.
    """

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        if self.reference_lists:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2], node[1]
        else:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        # XXX: Too much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        if self._key_length == 1:
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                node = self._nodes[key]
                if node[0]:
                    continue
                if self.reference_lists:
                    yield self, key, node[2], node[1]
                else:
                    yield self, key, node[2]
            return
        nodes_by_key = self._get_nodes_by_key()
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = nodes_by_key
            elements = list(key)
            # find the subdict to return
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements = elements[1:]
            except KeyError:
                # a non-existent lookup.
                continue
            if len(elements):
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + value
            else:
                yield (self, ) + key_dict

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._keys)

    def validate(self):
        """In memory indexes have no known corruption at the moment."""


class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.

    Queries against this will emit queries against the adapted Graph with the
    prefix added, queries for all items use iter_entries_prefix. The returned
    nodes will have their keys and node references adjusted to remove the
    prefix. Finally, an add_nodes_callback can be supplied - when called the
    nodes and references being added will have prefix prepended.
    """

    def __init__(self, adapted, prefix, missing_key_length,
        add_nodes_callback=None):
        """Construct an adapter against adapted with prefix."""
        self.adapted = adapted
        self.prefix_key = prefix + (None,)*missing_key_length
        self.prefix = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        # save nodes in case it's an iterator
        nodes = tuple(nodes)
        translated_nodes = []
        try:
            # Add prefix_key to each reference; node_refs is a tuple of tuples,
            # so split it apart, and add prefix_key to the internal reference
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
                        for ref_list in node_refs))
                translated_nodes.append((self.prefix + key, value,
                    adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
            # explicitly.
            for (key, value) in nodes:
                translated_nodes.append((self.prefix + key, value))
        self.add_nodes_callback(translated_nodes)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        self.add_nodes(((key, value, references), ))

    def _strip_prefix(self, an_iter):
        """Strip prefix data from nodes and return it."""
        for node in an_iter:
            # cross checks
            if node[1][:self.prefix_len] != self.prefix:
                raise errors.BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise errors.BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                for ref_list in node[3]))

    def iter_all_entries(self):
        """Iterate over all keys within the index

        iter_all_entries is implemented against the adapted index using
        iter_entries_prefix.

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        return self._strip_prefix(self.adapted.iter_entries(
            self.prefix + key for key in keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix(
            self.prefix + key for key in keys))

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndexPrefixAdapter this is relatively expensive - key
        iteration with the prefix is done.
        """
        return len(list(self.iter_all_entries()))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()
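

# A usage sketch for the adapter (illustrative, not from the original source):
# wrapping a key_elements=2 index so callers can work with 1-element keys:
#
#   builder = InMemoryGraphIndex(reference_lists=1, key_elements=2)
#   adapter = GraphIndexPrefixAdapter(
#       builder, ('file-id',), 1, add_nodes_callback=builder.add_nodes)
#   adapter.add_node(('rev-1',), 'value', ([],))
#   # stored in builder as (('file-id', 'rev-1'), 'value', ((),))
#   entries = list(adapter.iter_all_entries())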