55
63
VALUE := no-newline-no-null-bytes
58
def __init__(self, reference_lists=0):
66
def __init__(self, reference_lists=0, key_elements=1):
59
67
"""Create a GraphIndex builder.
61
69
:param reference_lists: The number of node references lists for each
71
:param key_elements: The number of bytestrings in each key.
64
73
self.reference_lists = reference_lists
76
self._nodes_by_key = {}
77
self._key_length = key_elements
79
def _check_key(self, key):
80
"""Raise BadIndexKey if key is not a valid key for this index."""
81
if type(key) != tuple:
82
raise errors.BadIndexKey(key)
83
if self._key_length != len(key):
84
raise errors.BadIndexKey(key)
86
if not element or _whitespace_re.search(element) is not None:
87
raise errors.BadIndexKey(element)
67
89
def add_node(self, key, value, references=()):
68
90
"""Add a node to the index.
70
:param key: The key. keys must be whitespace-free utf8.
92
:param key: The key. keys are non-empty tuples containing
93
as many whitespace-free utf8 bytestrings as the key length
94
defined for this index.
71
95
:param references: An iterable of iterables of keys. Each is a
72
96
reference to another key.
73
97
:param value: The value to associate with the key. It may be any
74
98
bytes as long as it does not contain \0 or \n.
76
if not key or _whitespace_re.search(key) is not None:
77
raise errors.BadIndexKey(key)
78
101
if _newline_null_re.search(value) is not None:
79
102
raise errors.BadIndexValue(value)
80
103
if len(references) != self.reference_lists:
83
106
for reference_list in references:
84
107
for reference in reference_list:
85
if _whitespace_re.search(reference) is not None:
86
raise errors.BadIndexKey(reference)
108
self._check_key(reference)
87
109
if reference not in self._nodes:
88
110
self._nodes[reference] = ('a', (), '')
89
111
node_refs.append(tuple(reference_list))
90
112
if key in self._nodes and self._nodes[key][0] == '':
91
113
raise errors.BadIndexDuplicateKey(key, self)
92
114
self._nodes[key] = ('', tuple(node_refs), value)
116
if self._key_length > 1:
117
key_dict = self._nodes_by_key
118
if self.reference_lists:
119
key_value = key, value, tuple(node_refs)
121
key_value = key, value
122
# possibly should do this on-demand, but it seems likely it is
124
# For a key of (foo, bar, baz) create
125
# _nodes_by_key[foo][bar][baz] = key_value
126
for subkey in key[:-1]:
127
key_dict = key_dict.setdefault(subkey, {})
128
key_dict[key[-1]] = key_value
95
131
lines = [_SIGNATURE]
96
132
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
97
prefix_length = len(lines[0]) + len(lines[1])
133
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
134
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
135
prefix_length = sum(len(x) for x in lines)
98
136
# references are byte offsets. To avoid having to do nasty
99
137
# polynomial work to resolve offsets (references to later in the
100
138
# file cannot be determined until all the inbetween references have
197
241
self._transport = transport
198
242
self._name = name
200
def iter_all_entries(self):
201
"""Iterate over all keys within the index.
203
:return: An iterable of (key, value) or (key, value, reference_lists).
204
The former tuple is used when there are no reference lists in the
205
index, making the API compatible with simple key:value index types.
206
There is no defined order for the result iteration - it will be in
207
the most efficient order for the index.
244
self._key_count = None
245
self._keys_by_offset = None
246
self._nodes_by_key = None
248
def _buffer_all(self):
249
"""Buffer all the index data.
251
Mutates self._nodes and self.keys_by_offset.
253
if 'index' in debug.debug_flags:
254
mutter('Reading entire index %s', self._transport.abspath(self._name))
209
255
stream = self._transport.get(self._name)
210
256
self._read_prefix(stream)
257
expected_elements = 3 + self._key_length
212
self.keys_by_offset = {}
259
# raw data keyed by offset
260
self._keys_by_offset = {}
261
# ready-to-return key:value or key:value, node_ref_lists
263
self._nodes_by_key = {}
214
265
pos = stream.tell()
215
266
for line in stream.readlines():
219
key, absent, references, value = line.split('\0')
270
elements = line.split('\0')
271
if len(elements) != expected_elements:
272
raise errors.BadIndexData(self)
274
key = tuple(elements[:self._key_length])
275
absent, references, value = elements[-3:]
220
276
value = value[:-1] # remove the newline
222
278
for ref_string in references.split('\t'):
224
280
int(ref) for ref in ref_string.split('\r') if ref
226
282
ref_lists = tuple(ref_lists)
227
self.keys_by_offset[pos] = (key, absent, ref_lists, value)
283
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
229
for key, absent, references, value in self.keys_by_offset.itervalues():
285
for key, absent, references, value in self._keys_by_offset.itervalues():
232
288
# resolve references:
233
289
if self.node_ref_lists:
235
291
for ref_list in references:
236
node_refs.append(tuple([self.keys_by_offset[ref][0] for ref in ref_list]))
237
yield (key, value, tuple(node_refs))
292
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
293
node_value = (value, tuple(node_refs))
296
self._nodes[key] = node_value
297
if self._key_length > 1:
298
subkey = list(reversed(key[:-1]))
299
key_dict = self._nodes_by_key
300
if self.node_ref_lists:
301
key_value = key, node_value[0], node_value[1]
303
key_value = key, node_value
304
# possibly should do this on-demand, but it seems likely it is
306
# For a key of (foo, bar, baz) create
307
# _nodes_by_key[foo][bar][baz] = key_value
308
for subkey in key[:-1]:
309
key_dict = key_dict.setdefault(subkey, {})
310
key_dict[key[-1]] = key_value
311
# cache the keys for quick set intersections
312
self._keys = set(self._nodes)
240
313
if trailers != 1:
241
314
# there must be one line - the empty trailer line.
242
315
raise errors.BadIndexData(self)
317
def iter_all_entries(self):
318
"""Iterate over all keys within the index.
320
:return: An iterable of (key, value) or (key, value, reference_lists).
321
The former tuple is used when there are no reference lists in the
322
index, making the API compatible with simple key:value index types.
323
There is no defined order for the result iteration - it will be in
324
the most efficient order for the index.
326
if 'evil' in debug.debug_flags:
327
trace.mutter_callsite(2,
328
"iter_all_entries scales with size of history.")
329
if self._nodes is None:
331
if self.node_ref_lists:
332
for key, (value, node_ref_lists) in self._nodes.iteritems():
333
yield self, key, value, node_ref_lists
335
for key, value in self._nodes.iteritems():
336
yield self, key, value
244
338
def _read_prefix(self, stream):
245
339
signature = stream.read(len(self._signature()))
246
340
if not signature == self._signature():
267
for node in self.iter_all_entries():
375
if self._nodes is None:
377
keys = keys.intersection(self._keys)
378
if self.node_ref_lists:
380
value, node_refs = self._nodes[key]
381
yield self, key, value, node_refs
384
yield self, key, self._nodes[key]
386
def iter_entries_prefix(self, keys):
387
"""Iterate over keys within the index using prefix matching.
389
Prefix matching is applied within the tuple of a key, not to within
390
the bytestring of each key element. e.g. if you have the keys ('foo',
391
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
392
only the former key is returned.
394
:param keys: An iterable providing the key prefixes to be retrieved.
395
Each key prefix takes the form of a tuple the length of a key, but
396
with the last N elements 'None' rather than a regular bytestring.
397
The first element cannot be 'None'.
398
:return: An iterable as per iter_all_entries, but restricted to the
399
keys with a matching prefix to those supplied. No additional keys
400
will be returned, and every match that is in the index will be
406
# load data - also finds key lengths
407
if self._nodes is None:
409
if self._key_length == 1:
413
raise errors.BadIndexKey(key)
414
if len(key) != self._key_length:
415
raise errors.BadIndexKey(key)
416
if self.node_ref_lists:
417
value, node_refs = self._nodes[key]
418
yield self, key, value, node_refs
420
yield self, key, self._nodes[key]
425
raise errors.BadIndexKey(key)
426
if len(key) != self._key_length:
427
raise errors.BadIndexKey(key)
428
# find what it refers to:
429
key_dict = self._nodes_by_key
431
# find the subdict whose contents should be returned.
433
while len(elements) and elements[0] is not None:
434
key_dict = key_dict[elements[0]]
437
# a non-existant lookup.
442
key_dict = dicts.pop(-1)
443
# can't be empty or would not exist
444
item, value = key_dict.iteritems().next()
445
if type(value) == dict:
447
dicts.extend(key_dict.itervalues())
450
for value in key_dict.itervalues():
451
# each value is the key:value:node refs tuple
453
yield (self, ) + value
455
# the last thing looked up was a terminal element
456
yield (self, ) + key_dict
459
"""Return an estimate of the number of keys in this index.
461
For GraphIndex the estimate is exact.
463
if self._key_count is None:
464
# really this should just read the prefix
466
return self._key_count
274
468
def _signature(self):
275
469
"""The file signature for this index type."""
345
539
for node in index.iter_entries(keys):
543
def iter_entries_prefix(self, keys):
544
"""Iterate over keys within the index using prefix matching.
546
Duplicate keys across child indices are presumed to have the same
547
value and are only reported once.
549
Prefix matching is applied within the tuple of a key, not to within
550
the bytestring of each key element. e.g. if you have the keys ('foo',
551
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
552
only the former key is returned.
554
:param keys: An iterable providing the key prefixes to be retrieved.
555
Each key prefix takes the form of a tuple the length of a key, but
556
with the last N elements 'None' rather than a regular bytestring.
557
The first element cannot be 'None'.
558
:return: An iterable as per iter_all_entries, but restricted to the
559
keys with a matching prefix to those supplied. No additional keys
560
will be returned, and every match that is in the index will be
567
for index in self._indices:
568
for node in index.iter_entries_prefix(keys):
569
if node[1] in seen_keys:
571
seen_keys.add(node[1])
575
"""Return an estimate of the number of keys in this index.
577
For CombinedGraphIndex this is approximated by the sum of the keys of
578
the child indices. As child indices may have duplicate keys this can
579
have a maximum error of the number of child indices * largest number of
582
return sum((index.key_count() for index in self._indices), 0)
349
584
def validate(self):
350
585
"""Validate that everything in the index can be accessed."""
396
638
if self.reference_lists:
397
for key in keys.intersection(self._nodes):
639
for key in keys.intersection(self._keys):
398
640
node = self._nodes[key]
400
yield key, node[2], node[1]
642
yield self, key, node[2], node[1]
402
for key in keys.intersection(self._nodes):
644
for key in keys.intersection(self._keys):
403
645
node = self._nodes[key]
647
yield self, key, node[2]
649
def iter_entries_prefix(self, keys):
650
"""Iterate over keys within the index using prefix matching.
652
Prefix matching is applied within the tuple of a key, not to within
653
the bytestring of each key element. e.g. if you have the keys ('foo',
654
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
655
only the former key is returned.
657
:param keys: An iterable providing the key prefixes to be retrieved.
658
Each key prefix takes the form of a tuple the length of a key, but
659
with the last N elements 'None' rather than a regular bytestring.
660
The first element cannot be 'None'.
661
:return: An iterable as per iter_all_entries, but restricted to the
662
keys with a matching prefix to those supplied. No additional keys
663
will be returned, and every match that is in the index will be
666
# XXX: To much duplication with the GraphIndex class; consider finding
667
# a good place to pull out the actual common logic.
671
if self._key_length == 1:
675
raise errors.BadIndexKey(key)
676
if len(key) != self._key_length:
677
raise errors.BadIndexKey(key)
678
node = self._nodes[key]
681
if self.reference_lists:
682
yield self, key, node[2], node[1]
684
yield self, key, node[2]
689
raise errors.BadIndexKey(key)
690
if len(key) != self._key_length:
691
raise errors.BadIndexKey(key)
692
# find what it refers to:
693
key_dict = self._nodes_by_key
695
# find the subdict to return
697
while len(elements) and elements[0] is not None:
698
key_dict = key_dict[elements[0]]
701
# a non-existant lookup.
706
key_dict = dicts.pop(-1)
707
# can't be empty or would not exist
708
item, value = key_dict.iteritems().next()
709
if type(value) == dict:
711
dicts.extend(key_dict.itervalues())
714
for value in key_dict.itervalues():
715
yield (self, ) + value
717
yield (self, ) + key_dict
720
"""Return an estimate of the number of keys in this index.
722
For InMemoryGraphIndex the estimate is exact.
724
return len(self._keys)
407
726
def validate(self):
408
727
"""In memory index's have no known corruption at the moment."""
730
class GraphIndexPrefixAdapter(object):
731
"""An adapter between GraphIndex with different key lengths.
733
Queries against this will emit queries against the adapted Graph with the
734
prefix added, queries for all items use iter_entries_prefix. The returned
735
nodes will have their keys and node references adjusted to remove the
736
prefix. Finally, an add_nodes_callback can be supplied - when called the
737
nodes and references being added will have prefix prepended.
740
def __init__(self, adapted, prefix, missing_key_length,
741
add_nodes_callback=None):
742
"""Construct an adapter against adapted with prefix."""
743
self.adapted = adapted
744
self.prefix_key = prefix + (None,)*missing_key_length
746
self.prefix_len = len(prefix)
747
self.add_nodes_callback = add_nodes_callback
749
def add_nodes(self, nodes):
750
"""Add nodes to the index.
752
:param nodes: An iterable of (key, node_refs, value) entries to add.
754
# save nodes in case its an iterator
756
translated_nodes = []
758
# Add prefix_key to each reference node_refs is a tuple of tuples,
759
# so split it apart, and add prefix_key to the internal reference
760
for (key, value, node_refs) in nodes:
761
adjusted_references = (
762
tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
763
for ref_list in node_refs))
764
translated_nodes.append((self.prefix + key, value,
765
adjusted_references))
767
# XXX: TODO add an explicit interface for getting the reference list
768
# status, to handle this bit of user-friendliness in the API more
770
for (key, value) in nodes:
771
translated_nodes.append((self.prefix + key, value))
772
self.add_nodes_callback(translated_nodes)
774
def add_node(self, key, value, references=()):
775
"""Add a node to the index.
777
:param key: The key. keys are non-empty tuples containing
778
as many whitespace-free utf8 bytestrings as the key length
779
defined for this index.
780
:param references: An iterable of iterables of keys. Each is a
781
reference to another key.
782
:param value: The value to associate with the key. It may be any
783
bytes as long as it does not contain \0 or \n.
785
self.add_nodes(((key, value, references), ))
787
def _strip_prefix(self, an_iter):
788
"""Strip prefix data from nodes and return it."""
791
if node[1][:self.prefix_len] != self.prefix:
792
raise errors.BadIndexData(self)
793
for ref_list in node[3]:
794
for ref_node in ref_list:
795
if ref_node[:self.prefix_len] != self.prefix:
796
raise errors.BadIndexData(self)
797
yield node[0], node[1][self.prefix_len:], node[2], (
798
tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
799
for ref_list in node[3]))
801
def iter_all_entries(self):
802
"""Iterate over all keys within the index
804
iter_all_entries is implemented against the adapted index using
807
:return: An iterable of (key, reference_lists, value). There is no
808
defined order for the result iteration - it will be in the most
809
efficient order for the index (in this case dictionary hash order).
811
return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))
813
def iter_entries(self, keys):
814
"""Iterate over keys within the index.
816
:param keys: An iterable providing the keys to be retrieved.
817
:return: An iterable of (key, reference_lists, value). There is no
818
defined order for the result iteration - it will be in the most
819
efficient order for the index (keys iteration order in this case).
821
return self._strip_prefix(self.adapted.iter_entries(
822
self.prefix + key for key in keys))
824
def iter_entries_prefix(self, keys):
825
"""Iterate over keys within the index using prefix matching.
827
Prefix matching is applied within the tuple of a key, not to within
828
the bytestring of each key element. e.g. if you have the keys ('foo',
829
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
830
only the former key is returned.
832
:param keys: An iterable providing the key prefixes to be retrieved.
833
Each key prefix takes the form of a tuple the length of a key, but
834
with the last N elements 'None' rather than a regular bytestring.
835
The first element cannot be 'None'.
836
:return: An iterable as per iter_all_entries, but restricted to the
837
keys with a matching prefix to those supplied. No additional keys
838
will be returned, and every match that is in the index will be
841
return self._strip_prefix(self.adapted.iter_entries_prefix(
842
self.prefix + key for key in keys))
845
"""Return an estimate of the number of keys in this index.
847
For GraphIndexPrefixAdapter this is relatively expensive - key
848
iteration with the prefix is done.
850
return len(list(self.iter_all_entries()))
853
"""Call the adapted's validate."""
854
self.adapted.validate()