# Copyright (C) 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Indexing facilities."""
23
'GraphIndexPrefixAdapter',
27
from bisect import bisect_right
28
from cStringIO import StringIO
31
from bzrlib.lazy_import import lazy_import
32
lazy_import(globals(), """
33
from bzrlib import trace
34
from bzrlib.bisect_multi import bisect_multi_bytes
35
from bzrlib.revision import NULL_REVISION
36
from bzrlib.trace import mutter
44
_HEADER_READV = (0, 200)
45
_OPTION_KEY_ELEMENTS = "key_elements="
47
_OPTION_NODE_REFS = "node_ref_lists="
48
_SIGNATURE = "Bazaar Graph Index 1\n"
51
_whitespace_re = re.compile('[\t\n\x0b\x0c\r\x00 ]')
52
_newline_null_re = re.compile('[\n\0]')
55
class GraphIndexBuilder(object):
56
"""A builder that can build a GraphIndex.
58
The resulting graph has the structure:
60
_SIGNATURE OPTIONS NODES NEWLINE
61
_SIGNATURE := 'Bazaar Graph Index 1' NEWLINE
62
OPTIONS := 'node_ref_lists=' DIGITS NEWLINE
64
NODE := KEY NULL ABSENT? NULL REFERENCES NULL VALUE NEWLINE
65
KEY := Not-whitespace-utf8
67
REFERENCES := REFERENCE_LIST (TAB REFERENCE_LIST){node_ref_lists - 1}
68
REFERENCE_LIST := (REFERENCE (CR REFERENCE)*)?
69
REFERENCE := DIGITS ; digits is the byte offset in the index of the
71
VALUE := no-newline-no-null-bytes
74
def __init__(self, reference_lists=0, key_elements=1):
75
"""Create a GraphIndex builder.
77
:param reference_lists: The number of node references lists for each
79
:param key_elements: The number of bytestrings in each key.
81
self.reference_lists = reference_lists
84
self._nodes_by_key = {}
85
self._key_length = key_elements
87
def _check_key(self, key):
88
"""Raise BadIndexKey if key is not a valid key for this index."""
89
if type(key) != tuple:
90
raise errors.BadIndexKey(key)
91
if self._key_length != len(key):
92
raise errors.BadIndexKey(key)
94
if not element or _whitespace_re.search(element) is not None:
95
raise errors.BadIndexKey(element)
97
def add_node(self, key, value, references=()):
98
"""Add a node to the index.
100
:param key: The key. keys are non-empty tuples containing
101
as many whitespace-free utf8 bytestrings as the key length
102
defined for this index.
103
:param references: An iterable of iterables of keys. Each is a
104
reference to another key.
105
:param value: The value to associate with the key. It may be any
106
bytes as long as it does not contain \0 or \n.
109
if _newline_null_re.search(value) is not None:
110
raise errors.BadIndexValue(value)
111
if len(references) != self.reference_lists:
112
raise errors.BadIndexValue(references)
114
for reference_list in references:
115
for reference in reference_list:
116
self._check_key(reference)
117
if reference not in self._nodes:
118
self._nodes[reference] = ('a', (), '')
119
node_refs.append(tuple(reference_list))
120
if key in self._nodes and self._nodes[key][0] == '':
121
raise errors.BadIndexDuplicateKey(key, self)
122
self._nodes[key] = ('', tuple(node_refs), value)
124
if self._key_length > 1:
125
key_dict = self._nodes_by_key
126
if self.reference_lists:
127
key_value = key, value, tuple(node_refs)
129
key_value = key, value
130
# possibly should do this on-demand, but it seems likely it is
132
# For a key of (foo, bar, baz) create
133
# _nodes_by_key[foo][bar][baz] = key_value
134
for subkey in key[:-1]:
135
key_dict = key_dict.setdefault(subkey, {})
136
key_dict[key[-1]] = key_value
140
lines.append(_OPTION_NODE_REFS + str(self.reference_lists) + '\n')
141
lines.append(_OPTION_KEY_ELEMENTS + str(self._key_length) + '\n')
142
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
143
prefix_length = sum(len(x) for x in lines)
144
# references are byte offsets. To avoid having to do nasty
145
# polynomial work to resolve offsets (references to later in the
146
# file cannot be determined until all the inbetween references have
147
# been calculated too) we pad the offsets with 0's to make them be
148
# of consistent length. Using binary offsets would break the trivial
150
# to calculate the width of zero's needed we do three passes:
151
# one to gather all the non-reference data and the number of references.
152
# one to pad all the data with reference-length and determine entry
156
# forward sorted by key. In future we may consider topological sorting,
157
# at the cost of table scans for direct lookup, or a second index for
159
nodes = sorted(self._nodes.items())
160
# if we do not prepass, we don't know how long it will be up front.
161
expected_bytes = None
162
# we only need to pre-pass if we have reference lists at all.
163
if self.reference_lists:
165
non_ref_bytes = prefix_length
167
# TODO use simple multiplication for the constants in this loop.
168
for key, (absent, references, value) in nodes:
169
# record the offset known *so far* for this key:
170
# the non reference bytes to date, and the total references to
171
# date - saves reaccumulating on the second pass
172
key_offset_info.append((key, non_ref_bytes, total_references))
173
# key is literal, value is literal, there are 3 null's, 1 NL
174
# key is variable length tuple, \x00 between elements
175
non_ref_bytes += sum(len(element) for element in key)
176
if self._key_length > 1:
177
non_ref_bytes += self._key_length - 1
178
# value is literal bytes, there are 3 null's, 1 NL.
179
non_ref_bytes += len(value) + 3 + 1
180
# one byte for absent if set.
183
elif self.reference_lists:
184
# (ref_lists -1) tabs
185
non_ref_bytes += self.reference_lists - 1
186
# (ref-1 cr's per ref_list)
187
for ref_list in references:
188
# how many references across the whole file?
189
total_references += len(ref_list)
190
# accrue reference separators
192
non_ref_bytes += len(ref_list) - 1
193
# how many digits are needed to represent the total byte count?
195
possible_total_bytes = non_ref_bytes + total_references*digits
196
while 10 ** digits < possible_total_bytes:
198
possible_total_bytes = non_ref_bytes + total_references*digits
199
expected_bytes = possible_total_bytes + 1 # terminating newline
200
# resolve key addresses.
202
for key, non_ref_bytes, total_references in key_offset_info:
203
key_addresses[key] = non_ref_bytes + total_references*digits
205
format_string = '%%0%sd' % digits
206
for key, (absent, references, value) in nodes:
207
flattened_references = []
208
for ref_list in references:
210
for reference in ref_list:
211
ref_addresses.append(format_string % key_addresses[reference])
212
flattened_references.append('\r'.join(ref_addresses))
213
string_key = '\x00'.join(key)
214
lines.append("%s\x00%s\x00%s\x00%s\n" % (string_key, absent,
215
'\t'.join(flattened_references), value))
217
result = StringIO(''.join(lines))
218
if expected_bytes and len(result.getvalue()) != expected_bytes:
219
raise errors.BzrError('Failed index creation. Internal error:'
220
' mismatched output length and expected length: %d %d' %
221
(len(result.getvalue()), expected_bytes))
225
class GraphIndex(object):
226
"""An index for data with embedded graphs.
228
The index maps keys to a list of key reference lists, and a value.
229
Each node has the same number of key reference lists. Each key reference
230
list can be empty or an arbitrary length. The value is an opaque NULL
231
terminated string without any newlines. The storage of the index is
232
hidden in the interface: keys and key references are always tuples of
233
bytestrings, never the internal representation (e.g. dictionary offsets).
235
It is presumed that the index will not be mutated - it is static data.
237
Successive iter_all_entries calls will read the entire index each time.
238
Additionally, iter_entries calls will read the index linearly until the
239
desired keys are found. XXX: This must be fixed before the index is
240
suitable for production use. :XXX
243
def __init__(self, transport, name, size):
244
"""Open an index called name on transport.
246
:param transport: A bzrlib.transport.Transport.
247
:param name: A path to provide to transport API calls.
248
:param size: The size of the index in bytes. This is used for bisection
249
logic to perform partial index reads. While the size could be
250
obtained by statting the file this introduced an additional round
251
trip as well as requiring stat'able transports, both of which are
252
avoided by having it supplied. If size is None, then bisection
253
support will be disabled and accessing the index will just stream
256
self._transport = transport
258
# Becomes a dict of key:(value, reference-list-byte-locations) used by
259
# the bisection interface to store parsed but not resolved keys.
260
self._bisect_nodes = None
261
# Becomes a dict of key:(value, reference-list-keys) which are ready to
262
# be returned directly to callers.
264
# a sorted list of slice-addresses for the parsed bytes of the file.
265
# e.g. (0,1) would mean that byte 0 is parsed.
266
self._parsed_byte_map = []
267
# a sorted list of keys matching each slice address for parsed bytes
268
# e.g. (None, 'foo@bar') would mean that the first byte contained no
269
# key, and the end byte of the slice is the of the data for 'foo@bar'
270
self._parsed_key_map = []
271
self._key_count = None
272
self._keys_by_offset = None
273
self._nodes_by_key = None
275
# The number of bytes we've read so far in trying to process this file
278
def __eq__(self, other):
279
"""Equal when self and other were created with the same parameters."""
281
type(self) == type(other) and
282
self._transport == other._transport and
283
self._name == other._name and
284
self._size == other._size)
286
def __ne__(self, other):
287
return not self.__eq__(other)
290
return "%s(%r)" % (self.__class__.__name__,
291
self._transport.abspath(self._name))
293
def _buffer_all(self, stream=None):
294
"""Buffer all the index data.
296
Mutates self._nodes and self.keys_by_offset.
298
if self._nodes is not None:
299
# We already did this
301
if 'index' in debug.debug_flags:
302
mutter('Reading entire index %s', self._transport.abspath(self._name))
304
stream = self._transport.get(self._name)
305
self._read_prefix(stream)
306
self._expected_elements = 3 + self._key_length
308
# raw data keyed by offset
309
self._keys_by_offset = {}
310
# ready-to-return key:value or key:value, node_ref_lists
312
self._nodes_by_key = {}
315
lines = stream.read().split('\n')
317
_, _, _, trailers = self._parse_lines(lines, pos)
318
for key, absent, references, value in self._keys_by_offset.itervalues():
321
# resolve references:
322
if self.node_ref_lists:
323
node_value = (value, self._resolve_references(references))
326
self._nodes[key] = node_value
327
if self._key_length > 1:
328
subkey = list(reversed(key[:-1]))
329
key_dict = self._nodes_by_key
330
if self.node_ref_lists:
331
key_value = key, node_value[0], node_value[1]
333
key_value = key, node_value
334
# possibly should do this on-demand, but it seems likely it is
336
# For a key of (foo, bar, baz) create
337
# _nodes_by_key[foo][bar][baz] = key_value
338
for subkey in key[:-1]:
339
key_dict = key_dict.setdefault(subkey, {})
340
key_dict[key[-1]] = key_value
341
# cache the keys for quick set intersections
342
self._keys = set(self._nodes)
344
# there must be one line - the empty trailer line.
345
raise errors.BadIndexData(self)
347
def iter_all_entries(self):
348
"""Iterate over all keys within the index.
350
:return: An iterable of (index, key, value) or (index, key, value, reference_lists).
351
The former tuple is used when there are no reference lists in the
352
index, making the API compatible with simple key:value index types.
353
There is no defined order for the result iteration - it will be in
354
the most efficient order for the index.
356
if 'evil' in debug.debug_flags:
357
trace.mutter_callsite(3,
358
"iter_all_entries scales with size of history.")
359
if self._nodes is None:
361
if self.node_ref_lists:
362
for key, (value, node_ref_lists) in self._nodes.iteritems():
363
yield self, key, value, node_ref_lists
365
for key, value in self._nodes.iteritems():
366
yield self, key, value
368
def _read_prefix(self, stream):
369
signature = stream.read(len(self._signature()))
370
if not signature == self._signature():
371
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
372
options_line = stream.readline()
373
if not options_line.startswith(_OPTION_NODE_REFS):
374
raise errors.BadIndexOptions(self)
376
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):-1])
378
raise errors.BadIndexOptions(self)
379
options_line = stream.readline()
380
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
381
raise errors.BadIndexOptions(self)
383
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):-1])
385
raise errors.BadIndexOptions(self)
386
options_line = stream.readline()
387
if not options_line.startswith(_OPTION_LEN):
388
raise errors.BadIndexOptions(self)
390
self._key_count = int(options_line[len(_OPTION_LEN):-1])
392
raise errors.BadIndexOptions(self)
394
def _resolve_references(self, references):
395
"""Return the resolved key references for references.
397
References are resolved by looking up the location of the key in the
398
_keys_by_offset map and substituting the key name, preserving ordering.
400
:param references: An iterable of iterables of key locations. e.g.
402
:return: A tuple of tuples of keys.
405
for ref_list in references:
406
node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
407
return tuple(node_refs)
409
def _find_index(self, range_map, key):
410
"""Helper for the _parsed_*_index calls.
412
Given a range map - [(start, end), ...], finds the index of the range
413
in the map for key if it is in the map, and if it is not there, the
414
immediately preceeding range in the map.
416
result = bisect_right(range_map, key) - 1
417
if result + 1 < len(range_map):
418
# check the border condition, it may be in result + 1
419
if range_map[result + 1][0] == key[0]:
423
def _parsed_byte_index(self, offset):
424
"""Return the index of the entry immediately before offset.
426
e.g. if the parsed map has regions 0,10 and 11,12 parsed, meaning that
427
there is one unparsed byte (the 11th, addressed as[10]). then:
428
asking for 0 will return 0
429
asking for 10 will return 0
430
asking for 11 will return 1
431
asking for 12 will return 1
434
return self._find_index(self._parsed_byte_map, key)
436
def _parsed_key_index(self, key):
437
"""Return the index of the entry immediately before key.
439
e.g. if the parsed map has regions (None, 'a') and ('b','c') parsed,
440
meaning that keys from None to 'a' inclusive, and 'b' to 'c' inclusive
441
have been parsed, then:
442
asking for '' will return 0
443
asking for 'a' will return 0
444
asking for 'b' will return 1
445
asking for 'e' will return 1
447
search_key = (key, None)
448
return self._find_index(self._parsed_key_map, search_key)
450
def _is_parsed(self, offset):
451
"""Returns True if offset has been parsed."""
452
index = self._parsed_byte_index(offset)
453
if index == len(self._parsed_byte_map):
454
return offset < self._parsed_byte_map[index - 1][1]
455
start, end = self._parsed_byte_map[index]
456
return offset >= start and offset < end
458
def _iter_entries_from_total_buffer(self, keys):
459
"""Iterate over keys when the entire index is parsed."""
460
keys = keys.intersection(self._keys)
461
if self.node_ref_lists:
463
value, node_refs = self._nodes[key]
464
yield self, key, value, node_refs
467
yield self, key, self._nodes[key]
469
def iter_entries(self, keys):
470
"""Iterate over keys within the index.
472
:param keys: An iterable providing the keys to be retrieved.
473
:return: An iterable as per iter_all_entries, but restricted to the
474
keys supplied. No additional keys will be returned, and every
475
key supplied that is in the index will be returned.
480
if self._size is None and self._nodes is None:
483
# We fit about 20 keys per minimum-read (4K), so if we are looking for
484
# more than 1/20th of the index its likely (assuming homogenous key
485
# spread) that we'll read the entire index. If we're going to do that,
486
# buffer the whole thing. A better analysis might take key spread into
487
# account - but B+Tree indices are better anyway.
488
# We could look at all data read, and use a threshold there, which will
489
# trigger on ancestry walks, but that is not yet fully mapped out.
490
if self._nodes is None and len(keys) * 20 > self.key_count():
492
if self._nodes is not None:
493
return self._iter_entries_from_total_buffer(keys)
495
return (result[1] for result in bisect_multi_bytes(
496
self._lookup_keys_via_location, self._size, keys))
498
def iter_entries_prefix(self, keys):
499
"""Iterate over keys within the index using prefix matching.
501
Prefix matching is applied within the tuple of a key, not to within
502
the bytestring of each key element. e.g. if you have the keys ('foo',
503
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
504
only the former key is returned.
506
WARNING: Note that this method currently causes a full index parse
507
unconditionally (which is reasonably appropriate as it is a means for
508
thunking many small indices into one larger one and still supplies
509
iter_all_entries at the thunk layer).
511
:param keys: An iterable providing the key prefixes to be retrieved.
512
Each key prefix takes the form of a tuple the length of a key, but
513
with the last N elements 'None' rather than a regular bytestring.
514
The first element cannot be 'None'.
515
:return: An iterable as per iter_all_entries, but restricted to the
516
keys with a matching prefix to those supplied. No additional keys
517
will be returned, and every match that is in the index will be
523
# load data - also finds key lengths
524
if self._nodes is None:
526
if self._key_length == 1:
530
raise errors.BadIndexKey(key)
531
if len(key) != self._key_length:
532
raise errors.BadIndexKey(key)
533
if self.node_ref_lists:
534
value, node_refs = self._nodes[key]
535
yield self, key, value, node_refs
537
yield self, key, self._nodes[key]
542
raise errors.BadIndexKey(key)
543
if len(key) != self._key_length:
544
raise errors.BadIndexKey(key)
545
# find what it refers to:
546
key_dict = self._nodes_by_key
548
# find the subdict whose contents should be returned.
550
while len(elements) and elements[0] is not None:
551
key_dict = key_dict[elements[0]]
554
# a non-existant lookup.
559
key_dict = dicts.pop(-1)
560
# can't be empty or would not exist
561
item, value = key_dict.iteritems().next()
562
if type(value) == dict:
564
dicts.extend(key_dict.itervalues())
567
for value in key_dict.itervalues():
568
# each value is the key:value:node refs tuple
570
yield (self, ) + value
572
# the last thing looked up was a terminal element
573
yield (self, ) + key_dict
576
"""Return an estimate of the number of keys in this index.
578
For GraphIndex the estimate is exact.
580
if self._key_count is None:
581
self._read_and_parse([_HEADER_READV])
582
return self._key_count
584
def _lookup_keys_via_location(self, location_keys):
585
"""Public interface for implementing bisection.
587
If _buffer_all has been called, then all the data for the index is in
588
memory, and this method should not be called, as it uses a separate
589
cache because it cannot pre-resolve all indices, which buffer_all does
592
:param location_keys: A list of location(byte offset), key tuples.
593
:return: A list of (location_key, result) tuples as expected by
594
bzrlib.bisect_multi.bisect_multi_bytes.
596
# Possible improvements:
597
# - only bisect lookup each key once
598
# - sort the keys first, and use that to reduce the bisection window
600
# this progresses in three parts:
603
# attempt to answer the question from the now in memory data.
604
# build the readv request
605
# for each location, ask for 800 bytes - much more than rows we've seen
608
for location, key in location_keys:
609
# can we answer from cache?
610
if self._bisect_nodes and key in self._bisect_nodes:
611
# We have the key parsed.
613
index = self._parsed_key_index(key)
614
if (len(self._parsed_key_map) and
615
self._parsed_key_map[index][0] <= key and
616
(self._parsed_key_map[index][1] >= key or
617
# end of the file has been parsed
618
self._parsed_byte_map[index][1] == self._size)):
619
# the key has been parsed, so no lookup is needed even if its
622
# - if we have examined this part of the file already - yes
623
index = self._parsed_byte_index(location)
624
if (len(self._parsed_byte_map) and
625
self._parsed_byte_map[index][0] <= location and
626
self._parsed_byte_map[index][1] > location):
627
# the byte region has been parsed, so no read is needed.
630
if location + length > self._size:
631
length = self._size - location
632
# todo, trim out parsed locations.
634
readv_ranges.append((location, length))
635
# read the header if needed
636
if self._bisect_nodes is None:
637
readv_ranges.append(_HEADER_READV)
638
self._read_and_parse(readv_ranges)
640
if self._nodes is not None:
641
# _read_and_parse triggered a _buffer_all because we requested the
643
for location, key in location_keys:
644
if key not in self._nodes: # not present
645
result.append(((location, key), False))
646
elif self.node_ref_lists:
647
value, refs = self._nodes[key]
648
result.append(((location, key),
649
(self, key, value, refs)))
651
result.append(((location, key),
652
(self, key, self._nodes[key])))
655
# - figure out <, >, missing, present
656
# - result present references so we can return them.
657
# keys that we cannot answer until we resolve references
658
pending_references = []
659
pending_locations = set()
660
for location, key in location_keys:
661
# can we answer from cache?
662
if key in self._bisect_nodes:
663
# the key has been parsed, so no lookup is needed
664
if self.node_ref_lists:
665
# the references may not have been all parsed.
666
value, refs = self._bisect_nodes[key]
667
wanted_locations = []
668
for ref_list in refs:
670
if ref not in self._keys_by_offset:
671
wanted_locations.append(ref)
673
pending_locations.update(wanted_locations)
674
pending_references.append((location, key))
676
result.append(((location, key), (self, key,
677
value, self._resolve_references(refs))))
679
result.append(((location, key),
680
(self, key, self._bisect_nodes[key])))
683
# has the region the key should be in, been parsed?
684
index = self._parsed_key_index(key)
685
if (self._parsed_key_map[index][0] <= key and
686
(self._parsed_key_map[index][1] >= key or
687
# end of the file has been parsed
688
self._parsed_byte_map[index][1] == self._size)):
689
result.append(((location, key), False))
691
# no, is the key above or below the probed location:
692
# get the range of the probed & parsed location
693
index = self._parsed_byte_index(location)
694
# if the key is below the start of the range, its below
695
if key < self._parsed_key_map[index][0]:
699
result.append(((location, key), direction))
701
# lookup data to resolve references
702
for location in pending_locations:
704
if location + length > self._size:
705
length = self._size - location
706
# TODO: trim out parsed locations (e.g. if the 800 is into the
707
# parsed region trim it, and dont use the adjust_for_latency
710
readv_ranges.append((location, length))
711
self._read_and_parse(readv_ranges)
712
if self._nodes is not None:
713
# The _read_and_parse triggered a _buffer_all, grab the data and
715
for location, key in pending_references:
716
value, refs = self._nodes[key]
717
result.append(((location, key), (self, key, value, refs)))
719
for location, key in pending_references:
720
# answer key references we had to look-up-late.
721
value, refs = self._bisect_nodes[key]
722
result.append(((location, key), (self, key,
723
value, self._resolve_references(refs))))
726
def _parse_header_from_bytes(self, bytes):
727
"""Parse the header from a region of bytes.
729
:param bytes: The data to parse.
730
:return: An offset, data tuple such as readv yields, for the unparsed
731
data. (which may length 0).
733
signature = bytes[0:len(self._signature())]
734
if not signature == self._signature():
735
raise errors.BadIndexFormatSignature(self._name, GraphIndex)
736
lines = bytes[len(self._signature()):].splitlines()
737
options_line = lines[0]
738
if not options_line.startswith(_OPTION_NODE_REFS):
739
raise errors.BadIndexOptions(self)
741
self.node_ref_lists = int(options_line[len(_OPTION_NODE_REFS):])
743
raise errors.BadIndexOptions(self)
744
options_line = lines[1]
745
if not options_line.startswith(_OPTION_KEY_ELEMENTS):
746
raise errors.BadIndexOptions(self)
748
self._key_length = int(options_line[len(_OPTION_KEY_ELEMENTS):])
750
raise errors.BadIndexOptions(self)
751
options_line = lines[2]
752
if not options_line.startswith(_OPTION_LEN):
753
raise errors.BadIndexOptions(self)
755
self._key_count = int(options_line[len(_OPTION_LEN):])
757
raise errors.BadIndexOptions(self)
758
# calculate the bytes we have processed
759
header_end = (len(signature) + len(lines[0]) + len(lines[1]) +
761
self._parsed_bytes(0, None, header_end, None)
762
# setup parsing state
763
self._expected_elements = 3 + self._key_length
764
# raw data keyed by offset
765
self._keys_by_offset = {}
766
# keys with the value and node references
767
self._bisect_nodes = {}
768
return header_end, bytes[header_end:]
770
def _parse_region(self, offset, data):
771
"""Parse node data returned from a readv operation.
773
:param offset: The byte offset the data starts at.
774
:param data: The data to parse.
778
end = offset + len(data)
781
# Trivial test - if the current index's end is within the
782
# low-matching parsed range, we're done.
783
index = self._parsed_byte_index(high_parsed)
784
if end < self._parsed_byte_map[index][1]:
786
# print "[%d:%d]" % (offset, end), \
787
# self._parsed_byte_map[index:index + 2]
788
high_parsed, last_segment = self._parse_segment(
789
offset, data, end, index)
793
def _parse_segment(self, offset, data, end, index):
794
"""Parse one segment of data.
796
:param offset: Where 'data' begins in the file.
797
:param data: Some data to parse a segment of.
798
:param end: Where data ends
799
:param index: The current index into the parsed bytes map.
800
:return: True if the parsed segment is the last possible one in the
802
:return: high_parsed_byte, last_segment.
803
high_parsed_byte is the location of the highest parsed byte in this
804
segment, last_segment is True if the parsed segment is the last
805
possible one in the data block.
807
# default is to use all data
809
# accomodate overlap with data before this.
810
if offset < self._parsed_byte_map[index][1]:
811
# overlaps the lower parsed region
812
# skip the parsed data
813
trim_start = self._parsed_byte_map[index][1] - offset
814
# don't trim the start for \n
815
start_adjacent = True
816
elif offset == self._parsed_byte_map[index][1]:
817
# abuts the lower parsed region
820
# do not trim anything
821
start_adjacent = True
823
# does not overlap the lower parsed region
826
# but trim the leading \n
827
start_adjacent = False
828
if end == self._size:
829
# lines up to the end of all data:
832
# do not strip to the last \n
835
elif index + 1 == len(self._parsed_byte_map):
836
# at the end of the parsed data
839
# but strip to the last \n
842
elif end == self._parsed_byte_map[index + 1][0]:
843
# buts up against the next parsed region
846
# do not strip to the last \n
849
elif end > self._parsed_byte_map[index + 1][0]:
850
# overlaps into the next parsed region
851
# only consider the unparsed data
852
trim_end = self._parsed_byte_map[index + 1][0] - offset
853
# do not strip to the last \n as we know its an entire record
855
last_segment = end < self._parsed_byte_map[index + 1][1]
857
# does not overlap into the next region
860
# but strip to the last \n
863
# now find bytes to discard if needed
864
if not start_adjacent:
865
# work around python bug in rfind
866
if trim_start is None:
867
trim_start = data.find('\n') + 1
869
trim_start = data.find('\n', trim_start) + 1
870
if not (trim_start != 0):
871
raise AssertionError('no \n was present')
872
# print 'removing start', offset, trim_start, repr(data[:trim_start])
874
# work around python bug in rfind
876
trim_end = data.rfind('\n') + 1
878
trim_end = data.rfind('\n', None, trim_end) + 1
879
if not (trim_end != 0):
880
raise AssertionError('no \n was present')
881
# print 'removing end', offset, trim_end, repr(data[trim_end:])
882
# adjust offset and data to the parseable data.
883
trimmed_data = data[trim_start:trim_end]
884
if not (trimmed_data):
885
raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
886
% (trim_start, trim_end, offset, offset + len(data)))
889
# print "parsing", repr(trimmed_data)
890
# splitlines mangles the \r delimiters.. don't use it.
891
lines = trimmed_data.split('\n')
894
first_key, last_key, nodes, _ = self._parse_lines(lines, pos)
895
for key, value in nodes:
896
self._bisect_nodes[key] = value
897
self._parsed_bytes(offset, first_key,
898
offset + len(trimmed_data), last_key)
899
return offset + len(trimmed_data), last_segment
901
def _parse_lines(self, lines, pos):
910
if not (self._size == pos + 1):
911
raise AssertionError("%s %s" % (self._size, pos))
914
elements = line.split('\0')
915
if len(elements) != self._expected_elements:
916
raise errors.BadIndexData(self)
917
# keys are tuples. Each element is a string that may occur many
918
# times, so we intern them to save space. AB, RC, 200807
919
key = tuple(intern(element) for element in elements[:self._key_length])
920
if first_key is None:
922
absent, references, value = elements[-3:]
924
for ref_string in references.split('\t'):
925
ref_lists.append(tuple([
926
int(ref) for ref in ref_string.split('\r') if ref
928
ref_lists = tuple(ref_lists)
929
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
930
pos += len(line) + 1 # +1 for the \n
933
if self.node_ref_lists:
934
node_value = (value, ref_lists)
937
nodes.append((key, node_value))
938
# print "parsed ", key
939
return first_key, key, nodes, trailers
941
def _parsed_bytes(self, start, start_key, end, end_key):
942
"""Mark the bytes from start to end as parsed.
944
Calling self._parsed_bytes(1,2) will mark one byte (the one at offset
947
:param start: The start of the parsed region.
948
:param end: The end of the parsed region.
950
index = self._parsed_byte_index(start)
951
new_value = (start, end)
952
new_key = (start_key, end_key)
954
# first range parsed is always the beginning.
955
self._parsed_byte_map.insert(index, new_value)
956
self._parsed_key_map.insert(index, new_key)
960
# extend lower region
961
# extend higher region
962
# combine two regions
963
if (index + 1 < len(self._parsed_byte_map) and
964
self._parsed_byte_map[index][1] == start and
965
self._parsed_byte_map[index + 1][0] == end):
966
# combine two regions
967
self._parsed_byte_map[index] = (self._parsed_byte_map[index][0],
968
self._parsed_byte_map[index + 1][1])
969
self._parsed_key_map[index] = (self._parsed_key_map[index][0],
970
self._parsed_key_map[index + 1][1])
971
del self._parsed_byte_map[index + 1]
972
del self._parsed_key_map[index + 1]
973
elif self._parsed_byte_map[index][1] == start:
974
# extend the lower entry
975
self._parsed_byte_map[index] = (
976
self._parsed_byte_map[index][0], end)
977
self._parsed_key_map[index] = (
978
self._parsed_key_map[index][0], end_key)
979
elif (index + 1 < len(self._parsed_byte_map) and
980
self._parsed_byte_map[index + 1][0] == end):
981
# extend the higher entry
982
self._parsed_byte_map[index + 1] = (
983
start, self._parsed_byte_map[index + 1][1])
984
self._parsed_key_map[index + 1] = (
985
start_key, self._parsed_key_map[index + 1][1])
988
self._parsed_byte_map.insert(index + 1, new_value)
989
self._parsed_key_map.insert(index + 1, new_key)
991
def _read_and_parse(self, readv_ranges):
    """Read the ranges and parse the resulting data.

    :param readv_ranges: A prepared readv range list.
    """
    if not readv_ranges:
        # nothing requested; avoid a pointless transport round trip.
        return
    if self._nodes is None and self._bytes_read * 2 >= self._size:
        # We've already read more than 50% of the file and we are about to
        # request more data, just _buffer_all() and be done
        self._buffer_all()
        return

    readv_data = self._transport.readv(self._name, readv_ranges, True,
        self._size)
    # parse
    for offset, data in readv_data:
        self._bytes_read += len(data)
        if offset == 0 and len(data) == self._size:
            # We read the whole range, most likely because the
            # Transport upcast our readv ranges into one long request
            # for enough total data to grab the whole index.
            self._buffer_all(StringIO(data))
            return
        if self._bisect_nodes is None:
            # this must be the start
            if not (offset == 0):
                raise AssertionError()
            offset, data = self._parse_header_from_bytes(data)
        # print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
        self._parse_region(offset, data)
def _signature(self):
    """The file signature for this index type."""
    return _SIGNATURE

def validate(self):
    """Validate that everything in the index can be accessed."""
    # iter_all validates completely at the moment, so just do that.
    for node in self.iter_all_entries():
        pass
class CombinedGraphIndex(object):
    """A GraphIndex made up from smaller GraphIndices.

    The backing indices must implement GraphIndex, and are presumed to be
    static data.

    Queries against the combined index will be made against the first index,
    and then the second and so on. The order of index's can thus influence
    performance significantly. For example, if one index is on local disk and a
    second on a remote server, the local disk index should be before the other
    in the index list.
    """

    def __init__(self, indices):
        """Create a CombinedGraphIndex backed by indices.

        :param indices: An ordered list of indices to query for data.
        """
        self._indices = indices

    def __repr__(self):
        return "%s(%s)" % (
                self.__class__.__name__,
                ', '.join(map(repr, self._indices)))

    @symbol_versioning.deprecated_method(symbol_versioning.one_one)
    def get_parents(self, revision_ids):
        """See graph._StackedParentsProvider.get_parents.

        This implementation thunks the graph.Graph.get_parents api across to
        GraphIndex.

        :param revision_ids: An iterable of graph keys for this graph.
        :return: A list of parent details for each key in revision_ids.
            Each parent details will be one of:
             * None when the key was missing
             * (NULL_REVISION,) when the key has no parents.
             * (parent_key, parent_key...) otherwise.
        """
        parent_map = self.get_parent_map(revision_ids)
        return [parent_map.get(r, None) for r in revision_ids]

    def get_parent_map(self, keys):
        """See graph._StackedParentsProvider.get_parent_map"""
        search_keys = set(keys)
        if NULL_REVISION in search_keys:
            # NULL_REVISION is never stored in the index; answer it directly.
            search_keys.discard(NULL_REVISION)
            found_parents = {NULL_REVISION:[]}
        else:
            found_parents = {}
        for index, key, value, refs in self.iter_entries(search_keys):
            # the first reference list holds the parents.
            parents = refs[0]
            if not parents:
                parents = (NULL_REVISION,)
            found_parents[key] = parents
        return found_parents

    def insert_index(self, pos, index):
        """Insert a new index in the list of indices to query.

        :param pos: The position to insert the index.
        :param index: The index to insert.
        """
        self._indices.insert(pos, index)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :return: An iterable of (index, key, reference_lists, value).
            There is no defined order for the result iteration - it will be in
            the most efficient order for the index.
        """
        seen_keys = set()
        for index in self._indices:
            for node in index.iter_all_entries():
                if node[1] not in seen_keys:
                    yield node
                    seen_keys.add(node[1])

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index.
        """
        keys = set(keys)
        for index in self._indices:
            # stop querying further indices once every key is answered.
            if not keys:
                return
            for node in index.iter_entries(keys):
                keys.remove(node[1])
                yield node

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Duplicate keys across child indices are presumed to have the same
        value and are only reported once.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        keys = set(keys)
        if not keys:
            return
        seen_keys = set()
        for index in self._indices:
            for node in index.iter_entries_prefix(keys):
                if node[1] in seen_keys:
                    continue
                seen_keys.add(node[1])
                yield node

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For CombinedGraphIndex this is approximated by the sum of the keys of
        the child indices. As child indices may have duplicate keys this can
        have a maximum error of the number of child indices * largest number of
        keys in any index.
        """
        return sum((index.key_count() for index in self._indices), 0)

    def validate(self):
        """Validate that everything in the index can be accessed."""
        for index in self._indices:
            index.validate()
class InMemoryGraphIndex(GraphIndexBuilder):
    """A GraphIndex which operates entirely out of memory and is mutable.

    This is designed to allow the accumulation of GraphIndex entries during a
    single write operation, where the accumulated entries need to be immediately
    available - for example via a CombinedGraphIndex.
    """

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        if self.reference_lists:
            for (key, value, node_refs) in nodes:
                self.add_node(key, value, node_refs)
        else:
            for (key, value) in nodes:
                self.add_node(key, value)

    def iter_all_entries(self):
        """Iterate over all keys within the index

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(3,
                "iter_all_entries scales with size of history.")
        if self.reference_lists:
            for key, (absent, references, value) in self._nodes.iteritems():
                # absent entries are placeholders for references; skip them.
                if not absent:
                    yield self, key, value, references
        else:
            for key, (absent, references, value) in self._nodes.iteritems():
                if not absent:
                    yield self, key, value

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        keys = set(keys)
        if self.reference_lists:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                # node[0] is the absent flag.
                if not node[0]:
                    yield self, key, node[2], node[1]
        else:
            for key in keys.intersection(self._keys):
                node = self._nodes[key]
                if not node[0]:
                    yield self, key, node[2]

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        # XXX: To much duplication with the GraphIndex class; consider finding
        # a good place to pull out the actual common logic.
        keys = set(keys)
        if not keys:
            return
        if self._key_length == 1:
            # simple-key indices: prefixes must be whole keys.
            for key in keys:
                # sanity check
                if key[0] is None:
                    raise errors.BadIndexKey(key)
                if len(key) != self._key_length:
                    raise errors.BadIndexKey(key)
                node = self._nodes[key]
                if node[0]:
                    # absent placeholder; skip.
                    continue
                if self.reference_lists:
                    yield self, key, node[2], node[1]
                else:
                    yield self, key, node[2]
            return
        for key in keys:
            # sanity check
            if key[0] is None:
                raise errors.BadIndexKey(key)
            if len(key) != self._key_length:
                raise errors.BadIndexKey(key)
            # find what it refers to:
            key_dict = self._nodes_by_key
            elements = list(key)
            # find the subdict to return
            try:
                while len(elements) and elements[0] is not None:
                    key_dict = key_dict[elements[0]]
                    elements.pop(0)
            except KeyError:
                # a non-existant lookup.
                continue
            if len(elements):
                # trailing None elements: walk the whole subtree of dicts.
                dicts = [key_dict]
                while dicts:
                    key_dict = dicts.pop(-1)
                    # can't be empty or would not exist
                    item, value = key_dict.iteritems().next()
                    if type(value) == dict:
                        # push keys
                        dicts.extend(key_dict.itervalues())
                    else:
                        # yield keys
                        for value in key_dict.itervalues():
                            yield (self, ) + value
            else:
                yield (self, ) + key_dict

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For InMemoryGraphIndex the estimate is exact.
        """
        return len(self._keys)

    def validate(self):
        """In memory index's have no known corruption at the moment."""
class GraphIndexPrefixAdapter(object):
    """An adapter between GraphIndex with different key lengths.

    Queries against this will emit queries against the adapted Graph with the
    prefix added, queries for all items use iter_entries_prefix. The returned
    nodes will have their keys and node references adjusted to remove the
    prefix. Finally, an add_nodes_callback can be supplied - when called the
    nodes and references being added will have prefix prepended.
    """

    def __init__(self, adapted, prefix, missing_key_length,
        add_nodes_callback=None):
        """Construct an adapter against adapted with prefix.

        :param adapted: The index to adapt.
        :param prefix: The key-element prefix (a tuple) to add/strip.
        :param missing_key_length: The number of key elements the prefix
            falls short of a full key; used to build the all-entries
            prefix query.
        :param add_nodes_callback: Optional callable given the translated
            nodes when add_nodes/add_node is used.
        """
        self.adapted = adapted
        self.prefix_key = prefix + (None,)*missing_key_length
        self.prefix = prefix
        self.prefix_len = len(prefix)
        self.add_nodes_callback = add_nodes_callback

    def add_nodes(self, nodes):
        """Add nodes to the index.

        :param nodes: An iterable of (key, node_refs, value) entries to add.
        """
        # save nodes in case its an iterator
        nodes = tuple(nodes)
        translated_nodes = []
        try:
            # Add prefix_key to each reference node_refs is a tuple of tuples,
            # so split it apart, and add prefix_key to the internal reference
            for (key, value, node_refs) in nodes:
                adjusted_references = (
                    tuple(tuple(self.prefix + ref_node for ref_node in ref_list)
                        for ref_list in node_refs))
                translated_nodes.append((self.prefix + key, value,
                    adjusted_references))
        except ValueError:
            # XXX: TODO add an explicit interface for getting the reference list
            # status, to handle this bit of user-friendliness in the API more
            # explicitly.
            for (key, value) in nodes:
                translated_nodes.append((self.prefix + key, value))
        self.add_nodes_callback(translated_nodes)

    def add_node(self, key, value, references=()):
        """Add a node to the index.

        :param key: The key. keys are non-empty tuples containing
            as many whitespace-free utf8 bytestrings as the key length
            defined for this index.
        :param references: An iterable of iterables of keys. Each is a
            reference to another key.
        :param value: The value to associate with the key. It may be any
            bytes as long as it does not contain \0 or \n.
        """
        self.add_nodes(((key, value, references), ))

    def _strip_prefix(self, an_iter):
        """Strip prefix data from nodes and return it."""
        for node in an_iter:
            # cross-check that every key and reference carries the prefix.
            if node[1][:self.prefix_len] != self.prefix:
                raise errors.BadIndexData(self)
            for ref_list in node[3]:
                for ref_node in ref_list:
                    if ref_node[:self.prefix_len] != self.prefix:
                        raise errors.BadIndexData(self)
            yield node[0], node[1][self.prefix_len:], node[2], (
                tuple(tuple(ref_node[self.prefix_len:] for ref_node in ref_list)
                for ref_list in node[3]))

    def iter_all_entries(self):
        """Iterate over all keys within the index

        iter_all_entries is implemented against the adapted index using
        iter_entries_prefix.

        :return: An iterable of (index, key, reference_lists, value). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (in this case dictionary hash order).
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix([self.prefix_key]))

    def iter_entries(self, keys):
        """Iterate over keys within the index.

        :param keys: An iterable providing the keys to be retrieved.
        :return: An iterable of (index, key, value, reference_lists). There is no
            defined order for the result iteration - it will be in the most
            efficient order for the index (keys iteration order in this case).
        """
        return self._strip_prefix(self.adapted.iter_entries(
            self.prefix + key for key in keys))

    def iter_entries_prefix(self, keys):
        """Iterate over keys within the index using prefix matching.

        Prefix matching is applied within the tuple of a key, not to within
        the bytestring of each key element. e.g. if you have the keys ('foo',
        'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
        only the former key is returned.

        :param keys: An iterable providing the key prefixes to be retrieved.
            Each key prefix takes the form of a tuple the length of a key, but
            with the last N elements 'None' rather than a regular bytestring.
            The first element cannot be 'None'.
        :return: An iterable as per iter_all_entries, but restricted to the
            keys with a matching prefix to those supplied. No additional keys
            will be returned, and every match that is in the index will be
            returned.
        """
        return self._strip_prefix(self.adapted.iter_entries_prefix(
            self.prefix + key for key in keys))

    def key_count(self):
        """Return an estimate of the number of keys in this index.

        For GraphIndexPrefixAdapter this is relatively expensive - key
        iteration with the prefix is done.
        """
        return len(list(self.iter_all_entries()))

    def validate(self):
        """Call the adapted's validate."""
        self.adapted.validate()