94
95
if not element or _whitespace_re.search(element) is not None:
95
96
raise errors.BadIndexKey(element)
97
def add_node(self, key, value, references=()):
98
"""Add a node to the index.
100
:param key: The key. keys are non-empty tuples containing
101
as many whitespace-free utf8 bytestrings as the key length
102
defined for this index.
103
:param references: An iterable of iterables of keys. Each is a
104
reference to another key.
105
:param value: The value to associate with the key. It may be any
106
bytes as long as it does not contain \0 or \n.
98
def _get_nodes_by_key(self):
99
if self._nodes_by_key is None:
101
if self.reference_lists:
102
for key, (absent, references, value) in self._nodes.iteritems():
105
key_dict = nodes_by_key
106
for subkey in key[:-1]:
107
key_dict = key_dict.setdefault(subkey, {})
108
key_dict[key[-1]] = key, value, references
110
for key, (absent, references, value) in self._nodes.iteritems():
113
key_dict = nodes_by_key
114
for subkey in key[:-1]:
115
key_dict = key_dict.setdefault(subkey, {})
116
key_dict[key[-1]] = key, value
117
self._nodes_by_key = nodes_by_key
118
return self._nodes_by_key
120
def _update_nodes_by_key(self, key, value, node_refs):
121
"""Update the _nodes_by_key dict with a new key.
123
For a key of (foo, bar, baz) create
124
_nodes_by_key[foo][bar][baz] = key_value
126
if self._nodes_by_key is None:
128
key_dict = self._nodes_by_key
129
if self.reference_lists:
130
key_value = key, value, node_refs
132
key_value = key, value
133
for subkey in key[:-1]:
134
key_dict = key_dict.setdefault(subkey, {})
135
key_dict[key[-1]] = key_value
137
def _check_key_ref_value(self, key, references, value):
138
"""Check that 'key' and 'references' are all valid.
140
:param key: A key tuple. Must conform to the key interface (be a tuple,
141
be of the right length, not have any whitespace or nulls in any key
143
:param references: An iterable of reference lists. Something like
144
[[(ref, key)], [(ref, key), (other, key)]]
145
:param value: The value associate with this key. Must not contain
146
newlines or null characters.
147
:return: (node_refs, absent_references)
148
node_refs basically a packed form of 'references' where all
150
absent_references reference keys that are not in self._nodes.
151
This may contain duplicates if the same key is
152
referenced in multiple lists.
108
154
self._check_key(key)
109
155
if _newline_null_re.search(value) is not None:
111
157
if len(references) != self.reference_lists:
112
158
raise errors.BadIndexValue(references)
160
absent_references = []
114
161
for reference_list in references:
115
162
for reference in reference_list:
116
self._check_key(reference)
163
# If reference *is* in self._nodes, then we know it has already
117
165
if reference not in self._nodes:
118
self._nodes[reference] = ('a', (), '')
166
self._check_key(reference)
167
absent_references.append(reference)
119
168
node_refs.append(tuple(reference_list))
120
if key in self._nodes and self._nodes[key][0] == '':
169
return tuple(node_refs), absent_references
171
def add_node(self, key, value, references=()):
172
"""Add a node to the index.
174
:param key: The key. keys are non-empty tuples containing
175
as many whitespace-free utf8 bytestrings as the key length
176
defined for this index.
177
:param references: An iterable of iterables of keys. Each is a
178
reference to another key.
179
:param value: The value to associate with the key. It may be any
180
bytes as long as it does not contain \0 or \n.
183
absent_references) = self._check_key_ref_value(key, references, value)
184
if key in self._nodes and self._nodes[key][0] != 'a':
121
185
raise errors.BadIndexDuplicateKey(key, self)
122
self._nodes[key] = ('', tuple(node_refs), value)
186
for reference in absent_references:
187
# There may be duplicates, but I don't think it is worth worrying
189
self._nodes[reference] = ('a', (), '')
190
self._nodes[key] = ('', node_refs, value)
123
191
self._keys.add(key)
124
if self._key_length > 1:
125
key_dict = self._nodes_by_key
126
if self.reference_lists:
127
key_value = key, value, tuple(node_refs)
129
key_value = key, value
130
# possibly should do this on-demand, but it seems likely it is
132
# For a key of (foo, bar, baz) create
133
# _nodes_by_key[foo][bar][baz] = key_value
134
for subkey in key[:-1]:
135
key_dict = key_dict.setdefault(subkey, {})
136
key_dict[key[-1]] = key_value
192
if self._nodes_by_key is not None and self._key_length > 1:
193
self._update_nodes_by_key(key, value, node_refs)
138
195
def finish(self):
139
196
lines = [_SIGNATURE]
142
199
lines.append(_OPTION_LEN + str(len(self._keys)) + '\n')
143
200
prefix_length = sum(len(x) for x in lines)
144
201
# references are byte offsets. To avoid having to do nasty
145
# polynomial work to resolve offsets (references to later in the
202
# polynomial work to resolve offsets (references to later in the
146
203
# file cannot be determined until all the inbetween references have
147
204
# been calculated too) we pad the offsets with 0's to make them be
148
205
# of consistent length. Using binary offsets would break the trivial
284
343
def __ne__(self, other):
285
344
return not self.__eq__(other)
287
def _buffer_all(self):
347
return "%s(%r)" % (self.__class__.__name__,
348
self._transport.abspath(self._name))
350
def _buffer_all(self, stream=None):
288
351
"""Buffer all the index data.
290
353
Mutates self._nodes and self.keys_by_offset.
355
if self._nodes is not None:
356
# We already did this
292
358
if 'index' in debug.debug_flags:
293
359
mutter('Reading entire index %s', self._transport.abspath(self._name))
294
stream = self._transport.get(self._name)
361
stream = self._transport.get(self._name)
295
362
self._read_prefix(stream)
296
363
self._expected_elements = 3 + self._key_length
315
382
node_value = value
316
383
self._nodes[key] = node_value
317
384
if self._key_length > 1:
318
subkey = list(reversed(key[:-1]))
385
# TODO: We may want to do this lazily, but if we are calling
386
# _buffer_all, we are likely to be doing
387
# iter_entries_prefix
319
388
key_dict = self._nodes_by_key
320
389
if self.node_ref_lists:
321
390
key_value = key, node_value[0], node_value[1]
323
392
key_value = key, node_value
324
# possibly should do this on-demand, but it seems likely it is
326
393
# For a key of (foo, bar, baz) create
327
394
# _nodes_by_key[foo][bar][baz] = key_value
328
395
for subkey in key[:-1]:
464
531
keys supplied. No additional keys will be returned, and every
465
532
key supplied that is in the index will be returned.
467
# PERFORMANCE TODO: parse and bisect all remaining data at some
468
# threshold of total-index processing/get calling layers that expect to
469
# read the entire index to use the iter_all_entries method instead.
473
537
if self._size is None and self._nodes is None:
474
538
self._buffer_all()
540
# We fit about 20 keys per minimum-read (4K), so if we are looking for
541
# more than 1/20th of the index its likely (assuming homogenous key
542
# spread) that we'll read the entire index. If we're going to do that,
543
# buffer the whole thing. A better analysis might take key spread into
544
# account - but B+Tree indices are better anyway.
545
# We could look at all data read, and use a threshold there, which will
546
# trigger on ancestry walks, but that is not yet fully mapped out.
547
if self._nodes is None and len(keys) * 20 > self.key_count():
475
549
if self._nodes is not None:
476
550
return self._iter_entries_from_total_buffer(keys)
619
693
if self._bisect_nodes is None:
620
694
readv_ranges.append(_HEADER_READV)
621
695
self._read_and_parse(readv_ranges)
697
if self._nodes is not None:
698
# _read_and_parse triggered a _buffer_all because we requested the
700
for location, key in location_keys:
701
if key not in self._nodes: # not present
702
result.append(((location, key), False))
703
elif self.node_ref_lists:
704
value, refs = self._nodes[key]
705
result.append(((location, key),
706
(self, key, value, refs)))
708
result.append(((location, key),
709
(self, key, self._nodes[key])))
622
711
# generate results:
623
712
# - figure out <, >, missing, present
624
713
# - result present references so we can return them.
626
714
# keys that we cannot answer until we resolve references
627
715
pending_references = []
628
716
pending_locations = set()
679
767
readv_ranges.append((location, length))
680
768
self._read_and_parse(readv_ranges)
769
if self._nodes is not None:
770
# The _read_and_parse triggered a _buffer_all, grab the data and
772
for location, key in pending_references:
773
value, refs = self._nodes[key]
774
result.append(((location, key), (self, key, value, refs)))
681
776
for location, key in pending_references:
682
777
# answer key references we had to look-up-late.
683
index = self._parsed_key_index(key)
684
778
value, refs = self._bisect_nodes[key]
685
779
result.append(((location, key), (self, key,
686
780
value, self._resolve_references(refs))))
830
924
trim_start = data.find('\n') + 1
832
926
trim_start = data.find('\n', trim_start) + 1
833
assert trim_start != 0, 'no \n was present'
927
if not (trim_start != 0):
928
raise AssertionError('no \n was present')
834
929
# print 'removing start', offset, trim_start, repr(data[:trim_start])
835
930
if not end_adjacent:
836
931
# work around python bug in rfind
838
933
trim_end = data.rfind('\n') + 1
840
935
trim_end = data.rfind('\n', None, trim_end) + 1
841
assert trim_end != 0, 'no \n was present'
936
if not (trim_end != 0):
937
raise AssertionError('no \n was present')
842
938
# print 'removing end', offset, trim_end, repr(data[trim_end:])
843
939
# adjust offset and data to the parseable data.
844
940
trimmed_data = data[trim_start:trim_end]
845
assert trimmed_data, 'read unneeded data [%d:%d] from [%d:%d]' % (
846
trim_start, trim_end, offset, offset + len(data))
941
if not (trimmed_data):
942
raise AssertionError('read unneeded data [%d:%d] from [%d:%d]'
943
% (trim_start, trim_end, offset, offset + len(data)))
848
945
offset += trim_start
849
946
# print "parsing", repr(trimmed_data)
868
965
# must be at the end
870
assert self._size == pos + 1, "%s %s" % (self._size, pos)
967
if not (self._size == pos + 1):
968
raise AssertionError("%s %s" % (self._size, pos))
873
971
elements = line.split('\0')
874
972
if len(elements) != self._expected_elements:
875
973
raise errors.BadIndexData(self)
877
key = tuple(elements[:self._key_length])
974
# keys are tuples. Each element is a string that may occur many
975
# times, so we intern them to save space. AB, RC, 200807
976
key = tuple(intern(element) for element in elements[:self._key_length])
878
977
if first_key is None:
880
979
absent, references, value = elements[-3:]
952
1051
:param readv_ranges: A prepared readv range list.
955
readv_data = self._transport.readv(self._name, readv_ranges, True,
958
for offset, data in readv_data:
959
if self._bisect_nodes is None:
960
# this must be the start
962
offset, data = self._parse_header_from_bytes(data)
963
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
964
self._parse_region(offset, data)
1053
if not readv_ranges:
1055
if self._nodes is None and self._bytes_read * 2 >= self._size:
1056
# We've already read more than 50% of the file and we are about to
1057
# request more data, just _buffer_all() and be done
1061
readv_data = self._transport.readv(self._name, readv_ranges, True,
1064
for offset, data in readv_data:
1065
self._bytes_read += len(data)
1066
if offset == 0 and len(data) == self._size:
1067
# We read the whole range, most likely because the
1068
# Transport upcast our readv ranges into one long request
1069
# for enough total data to grab the whole index.
1070
self._buffer_all(StringIO(data))
1072
if self._bisect_nodes is None:
1073
# this must be the start
1074
if not (offset == 0):
1075
raise AssertionError()
1076
offset, data = self._parse_header_from_bytes(data)
1077
# print readv_ranges, "[%d:%d]" % (offset, offset + len(data))
1078
self._parse_region(offset, data)
966
1080
def _signature(self):
967
1081
"""The file signature for this index type."""