from bisect import bisect_right
from io import BytesIO
import re

from ..lazy_import import lazy_import
lazy_import(globals(), """
from breezy import (
    bisect_multi,
    revision as _mod_revision,
    trace,
    )
""")
from .. import (
    debug,
    errors,
    )
from ..static_tuple import StaticTuple


_HEADER_READV = (0, 200)
_OPTION_KEY_ELEMENTS = b"key_elements="
_OPTION_LEN = b"len="
_OPTION_NODE_REFS = b"node_ref_lists="
_SIGNATURE = b"Bazaar Graph Index 1\n"


class BadIndexFormatSignature(errors.BzrError):

    _fmt = "%(value)s is not an index of type %(_type)s."

    def __init__(self, value, _type):
        errors.BzrError.__init__(self)
        self.value = value
        self._type = _type


class BadIndexData(errors.BzrError):

    _fmt = "Error in data for index %(value)s."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


class BadIndexDuplicateKey(errors.BzrError):

    _fmt = "The key '%(key)s' is already in index '%(index)s'."

    def __init__(self, key, index):
        errors.BzrError.__init__(self)
        self.key = key
        self.index = index


class BadIndexKey(errors.BzrError):

    _fmt = "The key '%(key)s' is not a valid key."

    def __init__(self, key):
        errors.BzrError.__init__(self)
        self.key = key


class BadIndexOptions(errors.BzrError):

    _fmt = "Could not parse options for index %(value)s."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


class BadIndexValue(errors.BzrError):

    _fmt = "The value '%(value)s' is not a valid value."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


_whitespace_re = re.compile(b'[\t\n\x0b\x0c\r\x00 ]')
_newline_null_re = re.compile(b'[\n\0]')


def _has_key_from_parent_map(self, key):
    """Check if this index has one key.

    If it's possible to check for multiple keys at once through
    calling get_parent_map that should be faster.
    """
    return (key in self.get_parent_map([key]))


def _missing_keys_from_parent_map(self, keys):
    return set(keys) - set(self.get_parent_map(keys))


class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure::

      _SIGNATURE OPTIONS NODES NEWLINE
      _SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
      OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
      NODES := NODE*
      NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
      KEY := Not-whitespace-utf8
      ABSENT := 'a'
      REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
      REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
      REFERENCE := DIGITS  ; digits is the byte offset in the index of the
                           ; referenced key.
      VALUE := no-newline-no-null-bytes
    """
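    # Editor's illustrative sketch (not from the original source): a tiny
    # index with one reference list and single-element keys could serialise
    # roughly as follows (NULL shown as \x00, offsets illustrative only):
    #
    #   Bazaar Graph Index 1
    #   node_ref_lists=1
    #   key_elements=1
    #   len=2
    #   child\x00\x00074\x00child-value
    #   parent\x00\x00\x00parent-value
    #   <empty trailer line>
    #
    # Each REFERENCE is the zero-padded byte offset of the referenced key's
    # row; here '074' would point at the start of the 'parent' row.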

    def __init__(self, reference_lists=0, key_elements=1):
        """Create a GraphIndex builder.

        :param reference_lists: The number of node references lists for each
            entry.
        :param key_elements: The number of bytestrings in each key.
        """
        self.reference_lists = reference_lists
        # A dict of {key: (absent, ref_lists, value)}
        self._nodes = {}
        # Keys that are referenced but not actually present in this index
        self._absent_keys = set()
        self._nodes_by_key = None
        self._key_length = key_elements
        self._optimize_for_size = False
        self._combine_backing_indices = True

    def _check_key(self, key):
        """Raise BadIndexKey if key is not a valid key for this index."""
        if type(key) not in (tuple, StaticTuple):
            raise BadIndexKey(key)
        if self._key_length != len(key):
            raise BadIndexKey(key)
        for element in key:
            if (not element or type(element) != bytes
                    or _whitespace_re.search(element) is not None):
                raise BadIndexKey(key)
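    # For example (editor's note, not from the original source): with
    # key_elements=2, the key (b'file-id', b'rev-id') passes _check_key,
    # while (b'file-id',) fails on length and (b'bad key', b'rev-id') fails
    # because the embedded space matches _whitespace_re.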

    def _external_references(self):
        """Return references that are not present in this index.
        key_dict = self._nodes_by_key
        if self.reference_lists:
            key_value = StaticTuple(key, value, node_refs)
        else:
            key_value = StaticTuple(key, value)
        for subkey in key[:-1]:
            key_dict = key_dict.setdefault(subkey, {})
        key_dict[key[-1]] = key_value
        :param value: The value associated with this key. Must not contain
            newlines or null characters.
        :return: (node_refs, absent_references)

            * node_refs: basically a packed form of 'references' where all
              keys are tuples.
            * absent_references: reference keys that are not in self._nodes.
              This may contain duplicates if the same key is referenced in
              multiple lists.
        """
        as_st = StaticTuple.from_sequence
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise BadIndexValue(references)
        node_refs = []
        absent_references = []
        for reference_list in references:
            for reference in reference_list:
                # If reference *is* in self._nodes, then we know it has
                # already been checked.
                if reference not in self._nodes:
                    self._check_key(reference)
                    absent_references.append(reference)
            reference_list = as_st([as_st(ref).intern()
                                    for ref in reference_list])
            node_refs.append(reference_list)
        return as_st(node_refs), absent_references

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        (node_refs,
         absent_references) = self._check_key_ref_value(key, references, value)
        if key in self._nodes and self._nodes[key][0] != b'a':
            raise BadIndexDuplicateKey(key, self)
        for reference in absent_references:
            # There may be duplicates, but I don't think it is worth worrying
            # about
            self._nodes[reference] = (b'a', (), b'')
        self._absent_keys.update(absent_references)
        self._absent_keys.discard(key)
        self._nodes[key] = (b'', node_refs, value)
        if self._nodes_by_key is not None and self._key_length > 1:
            self._update_nodes_by_key(key, value, node_refs)

    def clear_cache(self):
        """See GraphIndex.clear_cache()

        This is a no-op, but we need the api to conform to a generic 'Index'
        abstraction.
        """

    def finish(self):
        """Finish the index.

        :returns: A BytesIO holding the full context of the index as it
            should be written to disk.
        """
        lines = [_SIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        key_count = len(self._nodes) - len(self._absent_keys)
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        prefix_length = sum(len(x) for x in lines)
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the
                non_ref_bytes += len(ref_list) - 1
        # how many digits are needed to represent the total byte count?
        digits = 1
        possible_total_bytes = non_ref_bytes + total_references * digits
        while 10 ** digits < possible_total_bytes:
            digits += 1
            possible_total_bytes = non_ref_bytes + total_references * digits
        expected_bytes = possible_total_bytes + 1  # terminating newline
        # resolve key addresses.
        key_addresses = {}
        for key, non_ref_bytes, total_references in key_offset_info:
            key_addresses[key] = non_ref_bytes + total_references * digits
        format_string = b'%%0%dd' % digits
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = []
                for reference in ref_list:
                    ref_addresses.append(format_string %
                                         key_addresses[reference])
                flattened_references.append(b'\r'.join(ref_addresses))
            string_key = b'\x00'.join(key)
            lines.append(b"%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
                         b'\t'.join(flattened_references), value))

        result = BytesIO(b''.join(lines))
        if expected_bytes and len(result.getvalue()) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                                  ' mismatched output length and expected length: %d %d' %
                                  (len(result.getvalue()), expected_bytes))
        return result
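    # Usage sketch (editor's note; hypothetical values, API as defined above):
    #
    #   builder = GraphIndexBuilder(reference_lists=1)
    #   builder.add_node((b'parent',), b'a-value')
    #   builder.add_node((b'child',), b'b-value', ([(b'parent',)],))
    #   index_bytes = builder.finish().read()
    #
    # finish() returns a BytesIO whose contents can be written to a transport.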

    def set_optimize(self, for_size=None, combine_backing_indices=None):
        """Change how the builder tries to optimize the result."""


class GraphIndex(object):
    """An index for data with embedded graphs.

    Successive iter_all_entries calls will read the entire index each time.
    Additionally, iter_entries calls will read the index linearly until the
    desired keys are found. XXX: This must be fixed before the index is
    suitable for production use. :XXX
    """

    def __init__(self, transport, name, size, unlimited_cache=False, offset=0):
        """Open an index called name on transport.

        :param transport: A breezy.transport.Transport.
        :param name: A path to provide to transport API calls.
        :param size: The size of the index in bytes. This is used for bisection
            logic to perform partial index reads. While the size could be
            obtained by statting the file this introduced an additional round
            trip as well as requiring stat'able transports, both of which are
            avoided by having it supplied. If size is None, then bisection
            support will be disabled and accessing the index will just stream
            all the data.
        :param offset: Instead of starting the index data at offset 0, start
            it at an arbitrary offset.
        """
        self._size = size
        # The number of bytes we've read so far in trying to process this file
        self._bytes_read = 0
        self._base_offset = offset

    def __eq__(self, other):
        """Equal when self and other were created with the same parameters."""
        return (
            isinstance(self, type(other)) and
            self._transport == other._transport and
            self._name == other._name and
            self._size == other._size)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        # We don't really care about the order, just that there is an order.
        if (not isinstance(other, GraphIndex) and
                not isinstance(other, InMemoryGraphIndex)):
            raise TypeError(other)
        return hash(self) < hash(other)

    def __hash__(self):
        return hash((type(self), self._transport, self._name, self._size))

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__,
                           self._transport.abspath(self._name))

    def _buffer_all(self, stream=None):
        """Buffer all the index data.

        Mutates self._nodes and self.keys_by_offset.
        """
        if self._nodes is not None:
            # We already did this
            return
        if 'index' in debug.debug_flags:
            trace.mutter('Reading entire index %s',
                         self._transport.abspath(self._name))
        if stream is None:
            stream = self._transport.get(self._name)
        if self._base_offset != 0:
            # This is wasteful, but it is better than dealing with
            # adjusting all the offsets, etc.
            stream = BytesIO(stream.read()[self._base_offset:])
        self._read_prefix(stream)
        self._expected_elements = 3 + self._key_length
        line_count = 0
        # raw data keyed by offset
        self._keys_by_offset = {}
        # ready-to-return key:value or key:value, node_ref_lists
        self._nodes = {}
        self._nodes_by_key = None
        trailers = 0
        pos = stream.tell()
        lines = stream.read().split(b'\n')
        _, _, _, trailers = self._parse_lines(lines, pos)
        for key, absent, references, value in self._keys_by_offset.values():
            if absent:
                continue
            # resolve references:
            if self.node_ref_lists:
                node_value = (value, self._resolve_references(references))
            else:
                node_value = value
            self._nodes[key] = node_value
        if trailers != 1:
            # there must be one line - the empty trailer line.
            raise BadIndexData(self)

    def clear_cache(self):
        """Clear out any cached/memoized values.

        This can be called at any time, but generally it is used when we have
        extracted some information, but don't expect to be requesting any more
        from this index.
        """

    def external_references(self, ref_list_num):
        """Return references that are not present in this index.
        """
        self._buffer_all()
        if ref_list_num + 1 > self.node_ref_lists:
            raise ValueError('No ref list %d, index has %d ref lists'
                             % (ref_list_num, self.node_ref_lists))
        refs = set()
        nodes = self._nodes
        for key, (value, ref_lists) in nodes.items():
            ref_list = ref_lists[ref_list_num]
            refs.update([ref for ref in ref_list if ref not in nodes])
        return refs

    def _get_nodes_by_key(self):
        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.node_ref_lists:
                for key, (value, references) in self._nodes.items():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, value in self._nodes.items():
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value
            self._nodes_by_key = nodes_by_key
        return self._nodes_by_key

    def iter_all_entries(self):
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                                  "iter_all_entries scales with size of history.")
        if self._nodes is None:
            self._buffer_all()
        if self.node_ref_lists:
            for key, (value, node_ref_lists) in self._nodes.items():
                yield self, key, value, node_ref_lists
        else:
            for key, value in self._nodes.items():
                yield self, key, value

    def _read_prefix(self, stream):
        signature = stream.read(len(self._signature()))
        if not signature == self._signature():
            raise BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_LEN):
            raise BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):-1])
        except ValueError:
            raise BadIndexOptions(self)

    def _resolve_references(self, references):
        """Return the resolved key references for references.

        References are resolved by looking up the location of the key in the
        _keys_by_offset map and substituting the key name, preserving ordering.

        :param references: An iterable of iterables of key locations. e.g.
            [[123, 456], [123]]
        :return: A tuple of tuples of keys.
        """
        node_refs = []
        for ref_list in references:
            node_refs.append(
                tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
        return tuple(node_refs)
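    # Example (editor's note, not from the original source): if
    # _keys_by_offset were {12: ((b'a',), b'', (), b'v1')}, then
    # _resolve_references(((12,),)) would return (((b'a',),),) -- each byte
    # offset is replaced by the key that was parsed at that offset.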

    @staticmethod
    def _find_index(range_map, key):
        """Helper for the _parsed_*_index calls.

        Given a range map - [(start, end), ...], finds the index of the range

    def _iter_entries_from_total_buffer(self, keys):
        """Iterate over keys when the entire index is parsed."""
        # Note: See the note in BTreeBuilder.iter_entries for why we don't use
        # .intersection() here
        nodes = self._nodes
        keys = [key for key in keys if key in nodes]
        if self.node_ref_lists:
            for key in keys:
                value, node_refs = nodes[key]
                yield self, key, value, node_refs
        else:
            for key in keys:
                yield self, key, nodes[key]

    def iter_entries(self, keys):
        """Iterate over keys within the index.
        """

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.
        """
        self._buffer_all()
        if self._key_length == 1:
            for key in keys:
                _sanity_check_key(self, key)
                if self.node_ref_lists:
                    value, node_refs = self._nodes[key]
                    yield self, key, value, node_refs
                else:
                    yield self, key, self._nodes[key]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in _iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """See BTreeIndex._find_ancestors."""

    def _lookup_keys_via_location(self, location_keys):
        """Public interface for implementing bisection.
        """
        result = []
        if self._nodes is not None:
            # _read_and_parse triggered a _buffer_all because we requested the
            # whole data range
            for location, key in location_keys:
                if key not in self._nodes:  # not present
                    result.append(((location, key), False))
                elif self.node_ref_lists:
                    value, refs = self._nodes[key]
                    result.append(((location, key),
                                   (self, key, value, refs)))
                else:
                    result.append(((location, key),
                                   (self, key, self._nodes[key])))
            return result
        # generate results:
        # - figure out <, >, missing, present
                        pending_references.append((location, key))
                        continue
                    result.append(((location, key), (self, key,
                                                     value, self._resolve_references(refs))))
                else:
                    result.append(((location, key),
                                   (self, key, self._bisect_nodes[key])))
            # has the region the key should be in, been parsed?
        for location, key in pending_references:
            # answer key references we had to look-up-late.
            value, refs = self._bisect_nodes[key]
            result.append(((location, key), (self, key,
                                             value, self._resolve_references(refs))))
        return result
889
def _parse_header_from_bytes(self, bytes):
970
896
signature = bytes[0:len(self._signature())]
971
897
if not signature == self._signature():
972
raise BadIndexFormatSignature(self._name, GraphIndex)
898
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
973
899
lines = bytes[len(self._signature()):].splitlines()
974
900
options_line = lines[0]
975
901
if not options_line.startswith(_OPTION_NODE_REFS):
976
raise BadIndexOptions(self)
902
raise errors.BadIndexOptions(self)
978
904
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
979
905
except ValueError:
980
raise BadIndexOptions(self)
906
raise errors.BadIndexOptions(self)
981
907
options_line = lines[1]
982
908
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
983
raise BadIndexOptions(self)
909
raise errors.BadIndexOptions(self)
985
911
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
986
912
except ValueError:
987
raise BadIndexOptions(self)
913
raise errors.BadIndexOptions(self)
988
914
options_line = lines[2]
989
915
if not options_line.startswith(_OPTION_LEN):
990
raise BadIndexOptions(self)
916
raise errors.BadIndexOptions(self)
992
918
self._key_count = int(options_line[len(_OPTION_LEN):])
993
919
except ValueError:
994
raise BadIndexOptions(self)
920
raise errors.BadIndexOptions(self)
995
921
# calculate the bytes we have processed
996
922
header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
998
self._parsed_bytes(0, (), header_end, ())
924
self._parsed_bytes(0, None, header_end, None)
999
925
# setup parsing state
1000
926
self._expected_elements = 3 + self._key_length
1001
927
# raw data keyed by offset
        if not start_adjacent:
            # work around python bug in rfind
            if trim_start is None:
                trim_start = data.find(b'\n') + 1
            else:
                trim_start = data.find(b'\n', trim_start) + 1
            if not (trim_start != 0):
                raise AssertionError('no \n was present')
            # print 'removing start', offset, trim_start, repr(data[:trim_start])
        if not end_adjacent:
            # work around python bug in rfind
            if trim_end is None:
                trim_end = data.rfind(b'\n') + 1
            else:
                trim_end = data.rfind(b'\n', None, trim_end) + 1
            if not (trim_end != 0):
                raise AssertionError('no \n was present')
            # print 'removing end', offset, trim_end, repr(data[trim_end:])
        # adjust offset and data to the parseable data.
        trimmed_data = data[trim_start:trim_end]
        if not (trimmed_data):
            raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
                                 % (trim_start, trim_end, offset, offset + len(data)))
        if trim_start:
            offset += trim_start
        # print "parsing", repr(trimmed_data)
        # splitlines mangles the \r delimiters.. don't use it.
        lines = trimmed_data.split(b'\n')
        del lines[-1]
        pos = offset
        first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
        for key, value in nodes:
            self._bisect_nodes[key] = value
        self._parsed_bytes(offset, first_key,
                           offset + len(trimmed_data), last_key)
        return offset + len(trimmed_data), last_segment

    def _parse_lines(self, lines, pos):
        key = None
        first_key = None
        trailers = 0
        nodes = []
        for line in lines:
            if line == b'':
                # must be at the end
                if self._size:
                    if not (self._size == pos + 1):
                        raise AssertionError("%s %s" % (self._size, pos))
                trailers += 1
                continue
            elements = line.split(b'\0')
            if len(elements) != self._expected_elements:
                raise BadIndexData(self)
            # keys are tuples. Each element is a string that may occur many
            # times, so we intern them to save space. AB, RC, 200807
            key = tuple([element for element in elements[:self._key_length]])
            if first_key is None:
                first_key = key
            absent, references, value = elements[-3:]
            ref_lists = []
            for ref_string in references.split(b'\t'):
                ref_lists.append(tuple([
                    int(ref) for ref in ref_string.split(b'\r') if ref
                    ]))
            ref_lists = tuple(ref_lists)
            self._keys_by_offset[pos] = (key, absent, ref_lists, value)
            pos += len(line) + 1  # +1 for the \n
            if absent:
                continue
            if self.node_ref_lists:
        # combine two regions
        if (index + 1 < len(self._parsed_byte_map) and
                self._parsed_byte_map[index][1] == start and
                self._parsed_byte_map[index + 1][0] == end):
            # combine two regions
            self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
                                            self._parsed_byte_map[index + 1][1])
            self._parsed_key_map[index] = (self._parsed_key_map[index][0],
                                           self._parsed_key_map[index + 1][1])
            del self._parsed_byte_map[index + 1]
            del self._parsed_key_map[index + 1]
        elif self._parsed_byte_map[index][1] == start:
            # extend the lower entry
            self._parsed_byte_map[index] = (
                self._parsed_byte_map[index][0], end)
            self._parsed_key_map[index] = (
                self._parsed_key_map[index][0], end_key)
        elif (index + 1 < len(self._parsed_byte_map) and
                self._parsed_byte_map[index + 1][0] == end):
            # extend the higher entry
            self._parsed_byte_map[index + 1] = (
                start, self._parsed_byte_map[index + 1][1])
            self._buffer_all()
            return

        base_offset = self._base_offset
        if base_offset != 0:
            # Rewrite the ranges for the offset
            readv_ranges = [(start + base_offset, size)
                            for start, size in readv_ranges]
        readv_data = self._transport.readv(self._name, readv_ranges, True,
                                           self._size + self._base_offset)
        # parse
        for offset, data in readv_data:
            offset -= base_offset
            self._bytes_read += len(data)
            if offset < 0:
                # transport.readv() expanded to extra data which isn't part of
                # this index
                data = data[-offset:]
                offset = 0
            if offset == 0 and len(data) == self._size:
                # We read the whole range, most likely because the
                # Transport upcast our readv ranges into one long request
                # for enough total data to grab the whole index.
                self._buffer_all(BytesIO(data))
                return
            if self._bisect_nodes is None:
                # this must be the start


class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of indices can thus influence
    performance significantly. For example, if one index is on local disk and a
    second on a remote server, the local disk index should be before the other
    in the index list.

    Also, queries tend to need results from the same indices as previous
    queries. So the indices will be reordered after every query to put the
    indices that had the result(s) of that query first (while otherwise
    preserving the relative ordering).
    """
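
    # Usage sketch (editor's note; the index names are hypothetical):
    #
    #   combined = CombinedGraphIndex([local_index, remote_index])
    #   hits = list(combined.iter_entries([(b'a-key',)]))
    #
    # If remote_index supplied the results, the reordering machinery below
    # moves it to the front of self._indices for subsequent queries.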

    def __init__(self, indices, reload_func=None):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        :param reload_func: A function to call if we find we are missing an
            index. Should have the form reload_func() => True/False to indicate
            if reloading actually changed anything.
        """
        self._indices = indices
        self._reload_func = reload_func
        # Sibling indices are other CombinedGraphIndex that we should call
        # _move_to_front_by_name on when we auto-reorder ourself.
        self._sibling_indices = []
        # A list of names that corresponds to the instances in self._indices,
        # so _index_names[0] is always the name for _indices[0], etc.  Sibling
        # indices must all use the same set of names as each other.
        self._index_names = [None] * len(self._indices)

    def __repr__(self):
        return "%s(%s)" % (
            self.__class__.__name__,
            ', '.join(map(repr, self._indices)))

    def clear_cache(self):
        """See GraphIndex.clear_cache()"""
        for index in self._indices:
            index.clear_cache()

    def get_parent_map(self, keys):
        """See graph.StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if _mod_revision.NULL_REVISION in search_keys:
            search_keys.discard(_mod_revision.NULL_REVISION)
            found_parents = {_mod_revision.NULL_REVISION: []}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            parents = refs[0]
            if not parents:
                parents = (_mod_revision.NULL_REVISION,)
            found_parents[key] = parents
        return found_parents
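    # Example (editor's note, not from the original source): with parents in
    # the first reference list, an entry (index, (b'rev-2',), b'',
    # (((b'rev-1',),),)) yields {(b'rev-2',): ((b'rev-1',),)}; a key with an
    # empty parent list is mapped to (_mod_revision.NULL_REVISION,).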

    __contains__ = _has_key_from_parent_map

    def insert_index(self, pos, index, name=None):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        :param name: a name for this index, e.g. a pack name.  These names can
            be used to reflect index reorderings to related CombinedGraphIndex
            instances that use the same names.  (see set_sibling_indices)
        """
        self._indices.insert(pos, index)
        self._index_names.insert(pos, name)

    def iter_all_entries(self):
        """Iterate over all keys within the index
        """

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, reference_lists, value). There is
            no defined order for the result iteration - it will be in the most
            efficient order for the index.
        """
        keys = set(keys)
        hit_indices = []
        while True:
            try:
                for index in self._indices:
                    if not keys:
                        break
                    index_hit = False
                    for node in index.iter_entries(keys):
                        keys.remove(node[1])
                        yield node
                        index_hit = True
                    if index_hit:
                        hit_indices.append(index)
                break
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise
        self._move_to_front(hit_indices)

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.
        """
        seen_keys = set()
        hit_indices = []
        while True:
            try:
                for index in self._indices:
                    index_hit = False
                    for node in index.iter_entries_prefix(keys):
                        if node[1] in seen_keys:
                            continue
                        seen_keys.add(node[1])
                        yield node
                        index_hit = True
                    if index_hit:
                        hit_indices.append(index)
                break
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise
        self._move_to_front(hit_indices)

    def _move_to_front(self, hit_indices):
        """Rearrange self._indices so that hit_indices are first.

        Order is maintained as much as possible, e.g. the first unhit index
        will be the first index in _indices after the hit_indices, and the
        hit_indices will be present in exactly the order they are passed to
        _move_to_front.

        _move_to_front propagates to all objects in self._sibling_indices by
        calling _move_to_front_by_name.
        """
        if self._indices[:len(hit_indices)] == hit_indices:
            # The 'hit_indices' are already at the front (and in the same
            # order), no need to re-order
            return
        hit_names = self._move_to_front_by_index(hit_indices)
        for sibling_idx in self._sibling_indices:
            sibling_idx._move_to_front_by_name(hit_names)

    def _move_to_front_by_index(self, hit_indices):
        """Core logic for _move_to_front.

        Returns a list of names corresponding to the hit_indices param.
        """
        indices_info = zip(self._index_names, self._indices)
        if 'index' in debug.debug_flags:
            indices_info = list(indices_info)
            trace.mutter('CombinedGraphIndex reordering: currently %r, '
                         'promoting %r', indices_info, hit_indices)
        hit_names = []
        unhit_names = []
        new_hit_indices = []
        unhit_indices = []
        for offset, (name, idx) in enumerate(indices_info):
            if idx in hit_indices:
                hit_names.append(name)
                new_hit_indices.append(idx)
                if len(new_hit_indices) == len(hit_indices):
                    # We've found all of the hit entries, everything else is
                    # unhit
                    unhit_names.extend(self._index_names[offset + 1:])
                    unhit_indices.extend(self._indices[offset + 1:])
                    break
            else:
                unhit_names.append(name)
                unhit_indices.append(idx)
        self._indices = new_hit_indices + unhit_indices
        self._index_names = hit_names + unhit_names
        if 'index' in debug.debug_flags:
            trace.mutter('CombinedGraphIndex reordered: %r', self._indices)
        return hit_names

    def _move_to_front_by_name(self, hit_names):
        """Moves indices named by 'hit_names' to front of the search order, as
        described in _move_to_front.
        """
        # Translate names to index instances, and then call
        # _move_to_front_by_index.
        indices_info = zip(self._index_names, self._indices)
        hit_indices = []
        for name, idx in indices_info:
            if name in hit_names:
                hit_indices.append(idx)
        self._move_to_front_by_index(hit_indices)

    def find_ancestry(self, keys, ref_list_num):
        """Find the complete ancestry for the given set of keys.
                # CombinedGraphIndex does not know what the ref lists
                # mean.
                search_keys = index._find_ancestors(search_keys,
                                                    ref_list_num, parent_map, index_missing_keys)
                # print ' \t \t%2d\t%4d\t%5d\t%5d' % (
                # sub_generation, len(search_keys),
                # len(parent_map), len(index_missing_keys))

    def key_count(self):
        """Return an estimate of the number of keys in this index.
        """
        while True:
            try:
                return sum((index.key_count() for index in self._indices), 0)
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise

    missing_keys = _missing_keys_from_parent_map

    def _try_reload(self, error):
        """We just got a NoSuchFile exception.

        Try to reload the indices, if it fails, just raise the current
        exception.
        """
        if self._reload_func is None:
            return False
        trace.mutter(
            'Trying to reload after getting exception: %s', str(error))
        if not self._reload_func():
            # We tried to reload, but nothing changed, so we fail anyway
            trace.mutter('_reload_func indicated nothing has changed.'
                         ' Raising original exception.')
            return False
        return True

    def set_sibling_indices(self, sibling_combined_graph_indices):
        """Set the CombinedGraphIndex objects to reorder after reordering self.
        """
        self._sibling_indices = sibling_combined_graph_indices

    def validate(self):
        """Validate that everything in the index can be accessed."""
        for index in self._indices:
            index.validate()


class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.
    """

    def iter_all_entries(self):
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                                  "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in self._nodes.items():
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in self._nodes.items():
                if not absent:
                    yield self, key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        # Note: See BTreeBuilder.iter_entries for an explanation of why we
        # aren't using set().intersection() here
        nodes = self._nodes
        keys = [key for key in keys if key in nodes]
        if self.reference_lists:
            for key in keys:
                node = nodes[key]
                if not node[0]:
                    yield self, key, node[2], node[1]
        else:
            for key in keys:
                node = nodes[key]
                if not node[0]:
                    yield self, key, node[2]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        if self._key_length == 1:
            for key in keys:
                _sanity_check_key(self, key)
                node = self._nodes[key]
                if node[0]:
                    continue
                if self.reference_lists:
                    yield self, key, node[2], node[1]
                else:
                    yield self, key, node[2]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in _iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._nodes) - len(self._absent_keys)

    def validate(self):
        """In memory indexes have no known corruption at the moment."""

    def __lt__(self, other):
        # We don't really care about the order, just that there is an order.
        if (not isinstance(other, GraphIndex) and
                not isinstance(other, InMemoryGraphIndex)):
            raise TypeError(other)
        return hash(self) < hash(other)


class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.
    """

    def __init__(self, adapted, prefix, missing_key_length,
                 add_nodes_callback=None):
        """Construct an adapter against adapted with prefix."""
        self.adapted = adapted
        self.prefix_key = prefix + (None,) * missing_key_length
        self.prefix = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
                          for ref_list in node_refs))
                translated_nodes.append((self.prefix + key, value,
                                         adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
        for node in an_iter:
            # cross checks
            if node[1][:self.prefix_len] != self.prefix:
                raise BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                      for ref_list in node[3]))

    def iter_all_entries(self):
        """Iterate over all keys within the index

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()


def _sanity_check_key(index_or_builder, key):
    """Raise BadIndexKey if key cannot be used for prefix matching."""
    if key[0] is None:
        raise BadIndexKey(key)
    if len(key) != index_or_builder._key_length:
        raise BadIndexKey(key)


def _iter_entries_prefix(index_or_builder, nodes_by_key, keys):
    """Helper for implementing prefix matching iterators."""
    for key in keys:
        _sanity_check_key(index_or_builder, key)
        # find what it refers to:
        key_dict = nodes_by_key
        elements = list(key)
        # find the subdict whose contents should be returned.
        try:
            while len(elements) and elements[0] is not None:
                key_dict = key_dict[elements[0]]
                elements.pop(0)
        except KeyError:
            # a non-existant lookup.
            continue
        if len(elements):
            dicts = [key_dict]
            while dicts:
                values_view = dicts.pop().values()
                # can't be empty or would not exist
                value = next(iter(values_view))
                if isinstance(value, dict):
                    # still descending, push values
                    dicts.extend(values_view)
                else:
                    # at leaf tuples, yield values
                    for value in values_view:
                        # each value is the key:value:node refs tuple
                        # ready to yield
                        yield (index_or_builder, ) + value
        else:
            # the last thing looked up was a terminal element
            yield (index_or_builder, ) + key_dict
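

# Usage sketch (editor's note; names are hypothetical): for a two-element-key
# index, a prefix query such as
#
#   list(_iter_entries_prefix(index, index._get_nodes_by_key(),
#                             [(b'prefix', None)]))
#
# walks nodes_by_key[b'prefix'] and yields an (index, key, value[, refs])
# tuple for every stored key beneath that prefix.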