from bisect import bisect_right

from ..lazy_import import lazy_import
lazy_import(globals(), """
from breezy import (
    bisect_multi,
    revision as _mod_revision,
    trace,
    )
""")
from ..sixish import (
    BytesIO,
    viewitems,
    viewvalues,
    )
from ..static_tuple import StaticTuple

_HEADER_READV = (0, 200)
_OPTION_KEY_ELEMENTS = b"key_elements="
_OPTION_LEN = b"len="
_OPTION_NODE_REFS = b"node_ref_lists="
_SIGNATURE = b"Bazaar Graph Index 1\n"


class BadIndexFormatSignature(errors.BzrError):

    _fmt = "%(value)s is not an index of type %(_type)s."

    def __init__(self, value, _type):
        errors.BzrError.__init__(self)
        self.value = value
        self._type = _type


class BadIndexData(errors.BzrError):

    _fmt = "Error in data for index %(value)s."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


class BadIndexDuplicateKey(errors.BzrError):

    _fmt = "The key '%(key)s' is already in index '%(index)s'."

    def __init__(self, key, index):
        errors.BzrError.__init__(self)
        self.key = key
        self.index = index


class BadIndexKey(errors.BzrError):

    _fmt = "The key '%(key)s' is not a valid key."

    def __init__(self, key):
        errors.BzrError.__init__(self)
        self.key = key


class BadIndexOptions(errors.BzrError):

    _fmt = "Could not parse options for index %(value)s."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


class BadIndexValue(errors.BzrError):

    _fmt = "The value '%(value)s' is not a valid value."

    def __init__(self, value):
        errors.BzrError.__init__(self)
        self.value = value


_whitespace_re = re.compile(b'[\t\n\x0b\x0c\r\x00 ]')
_newline_null_re = re.compile(b'[\n\0]')


def _has_key_from_parent_map(self, key):
    return (key in self.get_parent_map([key]))


class GraphIndexBuilder(object):
    """A builder that can build a GraphIndex.

    The resulting graph has the structure::

      _SIGNATURE OPTIONS NODES NEWLINE
      _SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
      OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
      NODES := NODE*
      NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
      KEY := Not-whitespace-utf8
      ABSENT := 'a'
      REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
      REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
      REFERENCE := DIGITS  ; digits is the byte offset in the index of the
                           ; referenced key.
      VALUE := no-newline-no-null-bytes
    """
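
    # Illustrative sketch (not from the original source): with
    # node_ref_lists=1 and key_elements=1, an index holding one node whose
    # hypothetical key is ('rev-1',), with no references and value b'42',
    # serialises (per finish() below) to:
    #
    #   Bazaar Graph Index 1
    #   node_ref_lists=1
    #   key_elements=1
    #   len=1
    #   rev-1<NUL><NUL><NUL>42
    #
    # where <NUL> stands for the b'\x00' separator, the ABSENT field is empty
    # because the node is present, and REFERENCES is empty because the node
    # refers to nothing; a final NEWLINE terminates the file.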

    def __init__(self, reference_lists=0, key_elements=1):

    def _check_key(self, key):
        """Raise BadIndexKey if key is not a valid key for this index."""
        if type(key) not in (tuple, StaticTuple):
            raise BadIndexKey(key)
        if self._key_length != len(key):
            raise BadIndexKey(key)
        for element in key:
            if not element or _whitespace_re.search(element) is not None:
                raise BadIndexKey(element)

    def _external_references(self):
        """Return references that are not present in this index.

                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
                for key, (absent, references, value) in viewitems(self._nodes):
                    key_dict = nodes_by_key

        :param value: The value associated with this key. Must not contain
            newlines or null characters.
        :return: (node_refs, absent_references)

            * node_refs: basically a packed form of 'references' where all
              iterables are tuples
            * absent_references: reference keys that are not in self._nodes.
              This may contain duplicates if the same key is referenced in
              multiple lists.
        """
        as_st = StaticTuple.from_sequence
        self._check_key(key)
        if _newline_null_re.search(value) is not None:
            raise BadIndexValue(value)
        if len(references) != self.reference_lists:
            raise BadIndexValue(references)
        absent_references = []
        for reference_list in references:

        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \\0 or \\n.
        """
        (node_refs,
         absent_references) = self._check_key_ref_value(key, references, value)
        if key in self._nodes and self._nodes[key][0] != 'a':
            raise BadIndexDuplicateKey(key, self)
        for reference in absent_references:
            # There may be duplicates, but I don't think it is worth worrying

    def finish(self):
        """Finish the index.

        :returns: BytesIO holding the full content of the index as it
            should be written to disk.
        """
        lines = [_SIGNATURE]
        lines.append(b'%s%d\n' % (_OPTION_NODE_REFS, self.reference_lists))
        lines.append(b'%s%d\n' % (_OPTION_KEY_ELEMENTS, self._key_length))
        key_count = len(self._nodes) - len(self._absent_keys)
        lines.append(b'%s%d\n' % (_OPTION_LEN, key_count))
        prefix_length = sum(len(x) for x in lines)
        # references are byte offsets. To avoid having to do nasty
        # polynomial work to resolve offsets (references to later in the

        # forward sorted by key. In future we may consider topological sorting,
        # at the cost of table scans for direct lookup, or a second index for
        nodes = sorted(viewitems(self._nodes))
        # if we do not prepass, we don't know how long it will be up front.
        expected_bytes = None
        # we only need to pre-pass if we have reference lists at all.

            for key, non_ref_bytes, total_references in key_offset_info:
                key_addresses[key] = non_ref_bytes + total_references*digits
            format_string = b'%%0%dd' % digits
        for key, (absent, references, value) in nodes:
            flattened_references = []
            for ref_list in references:
                ref_addresses = []
                for reference in ref_list:
                    ref_addresses.append(format_string % key_addresses[reference])
                flattened_references.append(b'\r'.join(ref_addresses))
            string_key = b'\x00'.join(key)
            lines.append(b"%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
                b'\t'.join(flattened_references), value))
        result = BytesIO(b''.join(lines))
        if expected_bytes and len(result.getvalue()) != expected_bytes:
            raise errors.BzrError('Failed index creation. Internal error:'
                ' mismatched output length and expected length: %d %d' %
                (len(result.getvalue()), expected_bytes))


    def __init__(self, transport, name, size, unlimited_cache=False, offset=0):
        """Open an index called name on transport.

        :param transport: A breezy.transport.Transport.
        :param name: A path to provide to transport API calls.
        :param size: The size of the index in bytes. This is used for bisection
            logic to perform partial index reads. While the size could be

            # We already did this
        if 'index' in debug.debug_flags:
            trace.mutter('Reading entire index %s',
                         self._transport.abspath(self._name))
        if stream is None:
            stream = self._transport.get(self._name)
            if self._base_offset != 0:
                # This is wasteful, but it is better than dealing with
                # adjusting all the offsets, etc.
                stream = BytesIO(stream.read()[self._base_offset:])
        self._read_prefix(stream)
        self._expected_elements = 3 + self._key_length

        pos = stream.tell()
        lines = stream.read().split('\n')
        # GZ 2009-09-20: Should really use a try/finally block to ensure close

        _, _, _, trailers = self._parse_lines(lines, pos)
        for key, absent, references, value in viewvalues(self._keys_by_offset):

        # resolve references:

                % (ref_list_num, self.node_ref_lists))
        nodes = self._nodes
        for key, (value, ref_lists) in viewitems(nodes):
            ref_list = ref_lists[ref_list_num]
            refs.update([ref for ref in ref_list if ref not in nodes])

        if self._nodes_by_key is None:
            nodes_by_key = {}
            if self.node_ref_lists:
                for key, (value, references) in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})
                    key_dict[key[-1]] = key, value, references
            else:
                for key, value in viewitems(self._nodes):
                    key_dict = nodes_by_key
                    for subkey in key[:-1]:
                        key_dict = key_dict.setdefault(subkey, {})

        if self._nodes is None:
            self._buffer_all()
        if self.node_ref_lists:
            for key, (value, node_ref_lists) in viewitems(self._nodes):
                yield self, key, value, node_ref_lists
        else:
            for key, value in viewitems(self._nodes):
                yield self, key, value

    def _read_prefix(self, stream):
        signature = stream.read(len(self._signature()))
        if not signature == self._signature():
            raise BadIndexFormatSignature(self._name, GraphIndex)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = stream.readline()
        if not options_line.startswith(_OPTION_LEN):
            raise BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):-1])
        except ValueError:
            raise BadIndexOptions(self)

    def _resolve_references(self, references):
        """Return the resolved key references for references.

        if self._nodes is not None:
            return self._iter_entries_from_total_buffer(keys)
        else:
            return (result[1] for result in bisect_multi.bisect_multi_bytes(
                self._lookup_keys_via_location, self._size, keys))

    def iter_entries_prefix(self, keys):

        self._buffer_all()
        if self._key_length == 1:
            for key in keys:
                _sanity_check_key(self, key)
                if self.node_ref_lists:
                    value, node_refs = self._nodes[key]
                    yield self, key, value, node_refs
                else:
                    yield self, key, self._nodes[key]
            return
        nodes_by_key = self._get_nodes_by_key()
        for entry in _iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def _find_ancestors(self, keys, ref_list_num, parent_map, missing_keys):
        """See BTreeIndex._find_ancestors."""

        signature = bytes[0:len(self._signature())]
        if not signature == self._signature():
            raise BadIndexFormatSignature(self._name, GraphIndex)
        lines = bytes[len(self._signature()):].splitlines()
        options_line = lines[0]
        if not options_line.startswith(_OPTION_NODE_REFS):
            raise BadIndexOptions(self)
        try:
            self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = lines[1]
        if not options_line.startswith(_OPTION_KEY_ELEMENTS):
            raise BadIndexOptions(self)
        try:
            self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
        except ValueError:
            raise BadIndexOptions(self)
        options_line = lines[2]
        if not options_line.startswith(_OPTION_LEN):
            raise BadIndexOptions(self)
        try:
            self._key_count = int(options_line[len(_OPTION_LEN):])
        except ValueError:
            raise BadIndexOptions(self)
        # calculate the bytes we have processed
        header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
            len(lines[2]) + 3)

            elements = line.split('\0')
            if len(elements) != self._expected_elements:
                raise BadIndexData(self)
            # keys are tuples. Each element is a string that may occur many
            # times, so we intern them to save space. AB, RC, 200807
            key = tuple([intern(element) for element in elements[:self._key_length]])

                # We read the whole range, most likely because the
                # Transport upcast our readv ranges into one long request
                # for enough total data to grab the whole index.
                self._buffer_all(BytesIO(data))

            if self._bisect_nodes is None:
                # this must be the start

    def get_parent_map(self, keys):
        """See graph.StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if _mod_revision.NULL_REVISION in search_keys:
            search_keys.discard(_mod_revision.NULL_REVISION)
            found_parents = {_mod_revision.NULL_REVISION: []}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            parents = refs[0]
            if not parents:
                parents = (_mod_revision.NULL_REVISION,)
            found_parents[key] = parents
        return found_parents
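
    # Illustrative sketch (not from the original source): as a parents
    # provider, a CombinedGraphIndex (hypothetically named ``combined``) is
    # queried like
    #
    #   parent_map = combined.get_parent_map(search_keys)
    #
    # Keys found in the index map to their parent keys from the first
    # reference list, an empty reference list is reported as
    # (NULL_REVISION,), and keys missing from every index are simply left
    # out of the returned dict.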

    __contains__ = _has_key_from_parent_map

    def insert_index(self, pos, index, name=None):
        """Insert a new index in the list of indices to query.

        indices_info = zip(self._index_names, self._indices)
        if 'index' in debug.debug_flags:
            indices_info = list(indices_info)
            trace.mutter('CombinedGraphIndex reordering: currently %r, '
                         'promoting %r', indices_info, hit_indices)
        unhit_names = []
        new_hit_indices = []

        self._indices = new_hit_indices + unhit_indices
        self._index_names = hit_names + unhit_names
        if 'index' in debug.debug_flags:
            trace.mutter('CombinedGraphIndex reordered: %r', self._indices)
        return hit_names

    def _move_to_front_by_name(self, hit_names):

                return sum((index.key_count() for index in self._indices), 0)
            except errors.NoSuchFile as e:
                if not self._try_reload(e):
                    raise

    missing_keys = _missing_keys_from_parent_map

    def _try_reload(self, error):
        """We just got a NoSuchFile exception.

        Try to reload the indices, if it fails, just raise the current
        exception.
        """
        if self._reload_func is None:
            return False
        trace.mutter('Trying to reload after getting exception: %s', error)
        if not self._reload_func():
            # We tried to reload, but nothing changed, so we fail anyway
            trace.mutter('_reload_func indicated nothing has changed.'
                         ' Raising original exception.')
            return False
        return True

    def set_sibling_indices(self, sibling_combined_graph_indices):
        """Set the CombinedGraphIndex objects to reorder after reordering self.

        trace.mutter_callsite(3,
            "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in viewitems(self._nodes):
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in viewitems(self._nodes):
                if not absent:
                    yield self, key, value

        will be returned, and every match that is in the index will be
        returned.
        """
        keys = set(keys)
        if self._key_length == 1:
            for key in keys:
                _sanity_check_key(self, key)
                node = self._nodes[key]

                yield self, key, node[2]
        nodes_by_key = self._get_nodes_by_key()
        for entry in _iter_entries_prefix(self, nodes_by_key, keys):
            yield entry

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        for node in an_iter:
            if node[1][:self.prefix_len] != self.prefix:
                raise BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                for ref_list in node[3]))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()


def _sanity_check_key(index_or_builder, key):
    """Raise BadIndexKey if key cannot be used for prefix matching."""
    if type(key) not in (tuple, StaticTuple):
        raise BadIndexKey(key)
    if len(key) != index_or_builder._key_length:
        raise BadIndexKey(key)


def _iter_entries_prefix(index_or_builder, nodes_by_key, keys):
    """Helper for implementing prefix matching iterators."""
    for key in keys:
        _sanity_check_key(index_or_builder, key)
        # find what it refers to:
        key_dict = nodes_by_key
        elements = list(key)
        # find the subdict whose contents should be returned.
        try:
            while len(elements) and elements[0] is not None:
                key_dict = key_dict[elements[0]]
                elements.pop(0)
        except KeyError:
            # a non-existant lookup.
            continue
        if len(elements):
            dicts = [key_dict]
            while dicts:
                values_view = viewvalues(dicts.pop())
                # can't be empty or would not exist
                value = next(iter(values_view))
                if isinstance(value, dict):
                    # still descending, push values
                    dicts.extend(values_view)
                else:
                    # at leaf tuples, yield values
                    for value in values_view:
                        # each value is the key:value:node refs tuple
                        yield (index_or_builder, ) + value
        else:
            # the last thing looked up was a terminal element
            yield (index_or_builder, ) + key_dict
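

# Illustrative sketch (not from the original source): with key_elements=2,
# nodes_by_key nests keys by their leading elements, so a lookup key whose
# trailing elements are None acts as a prefix wildcard.  Assuming a
# hypothetical GraphIndex-like object ``idx`` built with one reference list:
#
#   nodes_by_key = idx._get_nodes_by_key()
#   for entry in _iter_entries_prefix(idx, nodes_by_key, [(b'module', None)]):
#       index, key, value, refs = entry
#
# would yield every stored key whose first element is b'module'.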