class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta, new_version_id):
        """Apply delta to this object to become new_version_id."""
        raise NotImplementedError(self.apply_delta)

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = new_lines.text()
        old_texts = self.text()
        s = patiencediff.PatienceSequenceMatcher(None, old_texts, new_texts)
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'equal':
                continue
            # ofrom, oto, length, data
            yield i1, i2, j2 - j1, new_lines._lines[j1:j2]

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))
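
    # Illustrative sketch (added for clarity, not part of the original
    # source): the hunks produced above are (start, end, count, lines)
    # tuples against the old text. For annotated content, replacing line
    # 'b\n' with 'x\n' in ['a\n', 'b\n', 'c\n'] yields a single hunk,
    #
    #   (1, 2, 1, [('new-version-id', 'x\n')])
    #
    # i.e. old lines [1:2) are replaced by one new annotated line.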

    @staticmethod
    def get_line_delta_blocks(knit_delta, source, target):
        """Extract SequenceMatcher.get_matching_blocks() from a knit delta"""
        target_len = len(target)
        s_pos = 0
        t_pos = 0
        for s_begin, s_end, t_len, new_text in knit_delta:
            true_n = s_begin - s_pos
            n = true_n
            if n > 0:
                # knit deltas do not provide reliable info about whether the
                # last line of a file matches, due to eol handling.
                if source[s_pos + n - 1] != target[t_pos + n - 1]:
                    n -= 1
                if n > 0:
                    yield s_pos, t_pos, n
            t_pos += t_len + true_n
            s_pos = s_end
        n = target_len - t_pos
        if n > 0:
            if source[s_pos + n - 1] != target[t_pos + n - 1]:
                n -= 1
            if n > 0:
                yield s_pos, t_pos, n
        yield s_pos + (target_len - t_pos), target_len, 0
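
    # Worked example (added for clarity, not in the original source): with
    # source ['a\n', 'b\n', 'c\n'], target ['a\n', 'x\n', 'c\n'] and the
    # single-hunk delta [(1, 2, 1, ...)], the generator above yields
    # (0, 0, 1), (2, 2, 1) and the terminating (3, 3, 0) sentinel, matching
    # SequenceMatcher.get_matching_blocks() for those two sequences.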


class AnnotatedKnitContent(KnitContent):
    """Annotated content."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        return iter(self._lines)

    def apply_delta(self, delta, new_version_id):
        """Apply delta to this object to become new_version_id."""
        offset = 0
        lines = self._lines
        for start, end, count, delta_lines in delta:
            lines[offset+start:offset+end] = delta_lines
            offset = offset + (start - end) + count

    def strip_last_line_newline(self):
        line = self._lines[-1][1].rstrip('\n')
        self._lines[-1] = (self._lines[-1][0], line)

    def text(self):
        try:
            return [text for origin, text in self._lines]
        except ValueError, e:
            # most commonly (only?) caused by the internal form of the knit
            # missing annotation information because of a bug
            raise KnitCorrupt(self,
                "line in annotated knit missing annotation information: %s"
                % (e,))

    def copy(self):
        return AnnotatedKnitContent(self._lines[:])
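
    # Illustrative sketch (added for clarity, not in the original source):
    # applying the delta from the earlier example mutates the annotated
    # lines in place,
    #
    #   content = AnnotatedKnitContent([('v1', 'a\n'), ('v1', 'b\n'),
    #                                   ('v1', 'c\n')])
    #   content.apply_delta([(1, 2, 1, [('v2', 'x\n')])], 'v2')
    #   content._lines == [('v1', 'a\n'), ('v2', 'x\n'), ('v1', 'c\n')]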


class PlainKnitContent(KnitContent):
    """Unannotated content.

    When annotate[_iter] is called on this content, the same version is
    reported for all lines. Generally, annotate[_iter] is not useful on
    PlainKnitContent objects.
    """

    def __init__(self, lines, version_id):
        self._lines = lines
        self._version_id = version_id

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for line in self._lines:
            yield self._version_id, line

    def apply_delta(self, delta, new_version_id):
        """Apply delta to this object to become new_version_id."""
        offset = 0
        lines = self._lines
        for start, end, count, delta_lines in delta:
            lines[offset+start:offset+end] = delta_lines
            offset = offset + (start - end) + count
        self._version_id = new_version_id

    def copy(self):
        return PlainKnitContent(self._lines[:], self._version_id)

    def strip_last_line_newline(self):
        self._lines[-1] = self._lines[-1].rstrip('\n')

    def text(self):
        return self._lines


class KnitAnnotateFactory(object):
    """Factory for creating annotated Content objects."""

    annotated = True

    def make(self, lines, version_id):
        num_lines = len(lines)
        return AnnotatedKnitContent(zip([version_id] * num_lines, lines))

    def parse_fulltext(self, content, version_id):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) newline\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = [tuple(line.split(' ', 1)) for line in content]
        return AnnotatedKnitContent(lines)

    def parse_line_delta(self, lines, version_id, plain=False):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])

        :param plain: If True, the lines are returned as a plain
            list without annotations, not as a list of (origin, content)
            tuples, i.e. (start, end, count, [1..count newline])
        """
        result = []
        lines = iter(lines)
        next = lines.next

        cache = {}
        def cache_and_return(line):
            origin, text = line.split(' ', 1)
            return cache.setdefault(origin, origin), text

        # walk through the lines parsing.
        # Note that the plain test is explicitly pulled out of the
        # loop to minimise any performance impact
        if plain:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [next().split(' ', 1)[1] for i in xrange(count)]
                result.append((start, end, count, contents))
        else:
            for header in lines:
                start, end, count = [int(n) for n in header.split(',')]
                contents = [tuple(next().split(' ', 1)) for i in xrange(count)]
                result.append((start, end, count, contents))
        return result
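
    # Example (added for clarity, not in the original source): for the
    # serialized delta ['1,2,1\n', 'v2 x\n'], the annotated parse returns
    # [(1, 2, 1, [('v2', 'x\n')])], while plain=True strips the origins
    # and returns [(1, 2, 1, ['x\n'])].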

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                yield text
    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        return ['%s %s' % (o, t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(origin + ' ' + text
                       for origin, text in lines)
        return out
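
    # Round-trip sketch (added for clarity, not in the original source):
    #
    #   hunks = [(1, 2, 1, [('v2', 'x\n')])]
    #   factory.lower_line_delta(hunks) == ['1,2,1\n', 'v2 x\n']
    #   factory.parse_line_delta(['1,2,1\n', 'v2 x\n'], 'v2') == hunks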

    def annotate_iter(self, knit, version_id):
        content = knit._get_content(version_id)
        return content.annotate_iter()


class KnitPlainFactory(object):
    """Factory for creating plain Content objects."""

    annotated = False

    def make(self, lines, version_id):
        return PlainKnitContent(lines, version_id)

    def parse_fulltext(self, content, version_id):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - its just a constant versionid.
        """
        return self.make(content, version_id)

    def parse_line_delta_iter(self, lines, version_id):
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, lines[cur:cur+c]
            cur += c

    def parse_line_delta(self, lines, version_id):
        return list(self.parse_line_delta_iter(lines, version_id))

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return iter(lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                yield next()

    def lower_fulltext(self, content):
        return content.text()

        self.writable = (access_mode == 'w')
        self.delta = delta

        self._max_delta_chain = 200

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode,
            create_parent_dir=create_parent_dir, delay_create=delay_create,
            dir_mode=dir_mode)
        if access_method is None:
            _access = _KnitAccess(transport, relpath + DATA_SUFFIX, file_mode, dir_mode,
                ((create and not len(self)) and delay_create), create_parent_dir)
        else:
            _access = access_method
        if create and not len(self) and not delay_create:
            _access.create()
        self._data = _KnitData(_access)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _check_should_delta(self, first_parents):
        """Iterate back through the parent listing, looking for a fulltext.

        This is used when we want to decide whether to add a delta or a new
        fulltext. It searches for _max_delta_chain parents. When it finds a
        fulltext parent, it sees if the total size of the deltas leading up to
        it is large enough to indicate that we want a new full text anyway.

        Return True if we should create a new delta, False if we should use a
        full text.
        """
        delta_size = 0
        fulltext_size = None
        delta_parents = first_parents
        for count in xrange(self._max_delta_chain):
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            index, pos, size = self._index.get_position(parent)
            if method == 'fulltext':
                fulltext_size = size
                break
            delta_size += size
            delta_parents = self._index.get_parents(parent)
        else:
            # We couldn't find a fulltext, so we must create a new one
            return False

        return fulltext_size > delta_size
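
    # Worked example (added for clarity, not in the original source): if
    # the deltas back to the nearest fulltext total 1200 bytes and that
    # fulltext is 1000 bytes, then 1000 > 1200 is False, so a fresh
    # fulltext is stored; reconstructing through the chain would cost more
    # than it saves.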

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'."""

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory,
                                 delta=self.delta, create=True)

    def get_data_stream(self, required_versions):
        """Get a data stream for the specified versions.

        Versions may be returned in any order, not necessarily the order
        specified.

        :param required_versions: The exact set of versions to be extracted.
            Unlike some other knit methods, this is not used to generate a
            transitive closure, rather it is used precisely as given.

        :returns: format_signature, list of (version, options, length, parents),
            reader_callable.
        """
        if not isinstance(required_versions, set):
            required_versions = set(required_versions)
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        for version_id in required_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # Pick the desired versions out of the index in oldest-to-newest order
        version_list = []
        for version_id in self.versions():
            if version_id in required_versions:
                version_list.append(version_id)

        # create the list of version information for the result
        copy_queue_records = []
        copy_set = set()
        result_version_list = []
        for version_id in version_list:
            options = self._index.get_options(version_id)
            parents = self._index.get_parents_with_ghosts(version_id)
            index_memo = self._index.get_position(version_id)
            copy_queue_records.append((version_id, index_memo))
            none, data_pos, data_size = index_memo
            copy_set.add(version_id)
            # version, options, length, parents
            result_version_list.append((version_id, options, data_size,
                parents))

        # Read the compressed record data.
        # From here down to the return should really be logic in the returned
        # callable -- in a class that adapts read_records_iter_raw to read
        # requests.
        raw_datum = []
        for (version_id, raw_data), \
            (version_id2, options, _, parents) in \
            izip(self._data.read_records_iter_raw(copy_queue_records),
                 result_version_list):
            assert version_id == version_id2, 'logic error, inconsistent results'
            raw_datum.append(raw_data)
        pseudo_file = StringIO(''.join(raw_datum))
        def read(length):
            if length is None:
                return pseudo_file.read()
            else:
                return pseudo_file.read(length)
        return (self.get_format_signature(), result_version_list, read)
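
    # Consumption sketch (added for clarity, not in the original source):
    #
    #   format, versions, read = knit.get_data_stream(['rev-1', 'rev-2'])
    #   for version_id, options, length, parents in versions:
    #       raw_bytes = read(length)   # raw record bytes for version_id
    #
    # read(None) drains whatever record bytes remain.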

    def _extract_blocks(self, version_id, source, target):
        if self._index.get_method(version_id) != 'line-delta':
            return None
        parent, sha1, noeol, delta = self.get_delta(version_id)
        return KnitContent.get_line_delta_blocks(delta, source, target)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        self.check_not_reserved_id(version_id)
        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        index_memo = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, index_memo),))[version_id]
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_id)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = patiencediff.PatienceSequenceMatcher(None, old_texts,
                                                             new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_id)
            return parent, sha1, noeol, delta

    def get_format_signature(self):
        """See VersionedFile.get_format_signature()."""
        if self.factory.annotated:
            annotated_part = "annotated"
        else:
            annotated_part = "plain"
        return "knit-%s" % (annotated_part,)

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""

    def insert_data_stream(self, (format, data_list, reader_callable)):
        """Insert knit records from a data stream into this knit.

        If a version in the stream is already present in this knit, it will
        not be inserted a second time. It will be checked for consistency
        with the stored version however, and may cause a KnitCorrupt error to
        be raised if the data in the stream disagrees with the already stored
        data.

        :seealso: get_data_stream
        """
        if format != self.get_format_signature():
            trace.mutter('incompatible format signature inserting to %r', self)
            raise KnitDataStreamIncompatible(
                format, self.get_format_signature())

        for version_id, options, length, parents in data_list:
            if self.has_version(version_id):
                # First check: the list of parents.
                my_parents = self.get_parents_with_ghosts(version_id)
                if my_parents != parents:
                    # XXX: KnitCorrupt is not quite the right exception here.
                    raise KnitCorrupt(
                        self.filename,
                        'parents list %r from data stream does not match '
                        'already recorded parents %r for %s'
                        % (parents, my_parents, version_id))

                # Also check the SHA-1 of the fulltext this content will
                # produce.
                raw_data = reader_callable(length)
                my_fulltext_sha1 = self.get_sha1(version_id)
                df, rec = self._data._parse_record_header(version_id, raw_data)
                stream_fulltext_sha1 = rec[3]
                if my_fulltext_sha1 != stream_fulltext_sha1:
                    # Actually, we don't know if it's this knit that's corrupt,
                    # or the data stream we're trying to insert.
                    raise KnitCorrupt(
                        self.filename, 'sha-1 does not match %s' % version_id)
            else:
                if 'line-delta' in options:
                    # Make sure that this knit record is actually useful: a
                    # line-delta is no use unless we have its parent.
                    # Fetching from a broken repository with this problem
                    # shouldn't break the target repository.
                    if not self._index.has_version(parents[0]):
                        raise KnitCorrupt(
                            self.filename,
                            'line-delta from stream references '
                            'missing parent %s' % parents[0])
                self._add_raw_records(
                    [(version_id, options, parents, length)],
                    reader_callable(length))
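
    # Copy sketch (added for clarity, not in the original source): the
    # stream tuple from get_data_stream() feeds straight into another knit,
    #
    #   target_knit.insert_data_stream(source_knit.get_data_stream(ids))
    #
    # provided both knits share the same format signature.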

    def versions(self):
        """See VersionedFile.versions."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "versions scales with size of history")
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        if 'evil' in debug.debug_flags:
            trace.mutter_callsite(2, "has_version is a LBYL scenario")
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None,
                           left_matching_blocks=None):
        """Merge annotations for content. This is done by comparing
        the annotations based on changes to the text.
        """
        if left_matching_blocks is not None:
            delta_seq = diff._PrematchedMatcher(left_matching_blocks)
        else:
            delta_seq = None
        if annotated:
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                if (parent_id == parents[0] and delta_seq is not None):
                    seq = delta_seq
                else:
                    seq = patiencediff.PatienceSequenceMatcher(
                        None, merge_content.text(), content.text())
                for i, j, n in seq.get_matching_blocks():
                    if n == 0:
                        continue
                    # this appears to copy (origin, text) pairs across to the
                    # new content for any line that matches the last-checked
                    # parent.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if delta_seq is None:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = patiencediff.PatienceSequenceMatcher(
                    None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        self._index.check_versions_present(version_ids)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts,
        nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines, random_id, check_content)
        return self._add(version_id, lines, parents, self.delta,
            parent_texts, None, nostore_sha, random_id)

    def _add_lines(self, version_id, parents, lines, parent_texts,
        left_matching_blocks, nostore_sha, random_id, check_content):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines, random_id, check_content)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta,
            parent_texts, left_matching_blocks, nostore_sha, random_id)

    def _check_add(self, version_id, lines, random_id, check_content):
        """check that version_id and lines are safe to add."""
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        self.check_not_reserved_id(version_id)
        # Technically this could be avoided if we are happy to allow duplicate
        # id insertion when other things than bzr core insert texts, but it
        # seems useful for folk using the knit api directly to have some safety
        # blanket that we can disable.
        if not random_id and self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        if check_content:
            self._check_lines_not_unicode(lines)
            self._check_lines_are_lines(lines)

    def _add(self, version_id, lines, parents, delta, parent_texts,
             left_matching_blocks, nostore_sha, random_id):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # first thing, if the content is something we don't need to store, find
        # that out.
        line_bytes = ''.join(lines)
        digest = sha_string(line_bytes)
        if nostore_sha == digest:
            raise errors.ExistingContent

        present_parents = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if self.has_version(parent):
                present_parents.append(parent)

        # can only compress against the left most present parent.
        if (delta and
            (len(present_parents) == 0 or
             present_parents[0] != parents[0])):
            delta = False

        text_length = len(line_bytes)
        options = []
        if lines:
            if lines[-1][-1] != '\n':
                # copy the contents of lines.
                lines = lines[:]
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'
                line_bytes += '\n'

        if delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas. This should minimize both
            # I/O and the time spend applying deltas.
            delta = self._check_should_delta(present_parents)

        assert isinstance(version_id, str)
        content = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(content, present_parents,
                parent_texts, delta, self.factory.annotated,
                left_matching_blocks)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
            size, bytes = self._data._record_to_data(version_id, digest,
                store_lines)
        else:
            options.append('fulltext')
            # isinstance is slower and we have no hierarchy.
            if self.factory.__class__ == KnitPlainFactory:
                # Use the already joined bytes saving iteration time in
                # _record_to_data.
                size, bytes = self._data._record_to_data(version_id, digest,
                    lines, [line_bytes])
            else:
                # get mixed annotation + content and feed it into the
                # serialiser.
                store_lines = self.factory.lower_fulltext(content)
                size, bytes = self._data._record_to_data(version_id, digest,
                    store_lines)

        access_memo = self._data.add_raw_records([size], bytes)[0]
        self._index.add_versions(
            ((version_id, options, access_memo, parents),),
            random_id=random_id)
        return digest, text_length, content

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

                if component_id in content_map:
                    content = content_map[component_id]
                else:
                    if method == 'fulltext':
                        assert content is None
                        content = self.factory.parse_fulltext(data, version_id)
                    elif method == 'line-delta':
                        delta = self.factory.parse_line_delta(data, version_id)
                        if multiple_versions:
                            # only doing this when we want multiple versions
                            # output avoids list copies - which reference and
                            # dereference many strings.
                            content = content.copy()
                        content.apply_delta(delta, version_id)
                    if multiple_versions:
                        content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                if multiple_versions:
                    content = content.copy()
                content.strip_last_line_newline()
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            actual_sha = sha_strings(text)
            if actual_sha != digest:
                raise KnitCorrupt(self.filename,
                    '\n  sha-1 %s'
                    '\n  of reconstructed text does not match'
                    '\n  expected %s'
                    '\n  for version %s' %
                    (actual_sha, digest, version_id))
            text_map[version_id] = text
        return text_map, final_content

    def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                                pb=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        if pb is None:
            pb = progress.DummyProgress()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = set(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get a in-component-order queue:
        for version_id in self.versions():
            if version_id in requested_versions:
                index_memo = self._index.get_position(version_id)
                version_id_records.append((version_id, index_memo))

        total = len(version_id_records)
        for version_idx, (version_id, data, sha_value) in \
            enumerate(self._data.read_records_iter(version_id_records)):
            pb.update('Walking content.', version_idx, total)
            method = self._index.get_method(version_id)

            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                line_iterator = self.factory.get_fulltext_content(data)
            else:
                line_iterator = self.factory.get_linedelta_content(data)
            for line in line_iterator:
                yield line

        pb.update('Walking content.', total, total)

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        return self._index.iter_parents(version_ids)

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            try:
                # _load_data may raise NoSuchFile if the target knit is
                # completely empty.
                _load_data(self, fp)
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            elif delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(
                    self._filename, self.HEADER, mode=self._file_mode)

    def get_graph(self):
        """Return a list of the node:parents lists from this knit index."""
        return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        cache = self._cache
        while pending:
            version = pending.pop()
            # trim ghosts
            try:
                parents = [p for p in cache[version][4] if p in cache]
            except KeyError:
                raise RevisionNotPresent(version, self._filename)
            # if not completed and not a ghost
            pending.update([p for p in parents if p not in graph])
            graph[version] = parents
        if not topo_sorted:
            return graph.keys()
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        self.check_versions_present(versions)
        cache = self._cache
        graph = {}
        pending = set(versions)
        while pending:
            version = pending.pop()
            try:
                parents = cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # if not completed
                pending.update([p for p in parents if p not in graph])
                graph[version] = parents
        return topo_sort(graph.items())

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        for version_id in version_ids:
            try:
                yield version_id, tuple(self.get_parents(version_id))
            except KeyError:
                pass

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        """Get all the versions in the file. not topologically sorted."""
        return self._history
    def _version_list_to_index(self, versions):
        result_list = []
        cache = self._cache
        for version in versions:
            if version in cache:
                # -- inlined lookup() --
                result_list.append(str(cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version)
        return ' '.join(result_list)
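
    # Encoding example (added for clarity, not in the original source): a
    # parent already cached at history position 2 is written as '2', while
    # a ghost parent is written with a '.' prefix, so ['rev-b', 'ghost-rev']
    # might serialize as '2 .ghost-rev'.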

    def add_version(self, version_id, options, index_memo, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, index_memo, parents),))

    def add_versions(self, versions, random_id=False):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        lines = []
        orig_history = self._history[:]
        orig_cache = self._cache.copy()

        try:
            for version_id, options, (index, pos, size), parents in versions:
                line = "\n%s %s %s %s %s :" % (version_id,
                                               ','.join(options),
                                               pos,
                                               size,
                                               self._version_list_to_index(parents))
                assert isinstance(line, str), \
                    'content must be utf-8 encoded: %r' % (line,)
                lines.append(line)
                self._cache_version(version_id, options, pos, size, parents)
            if not self._need_to_create:
                self._transport.append_bytes(self._filename, ''.join(lines))
            else:
                sio = StringIO()
                sio.write(self.HEADER)
                sio.writelines(lines)
                sio.seek(0)
                self._transport.put_file_non_atomic(self._filename, sio,
                                    create_parent_dir=self._create_parent_dir,
                                    mode=self._file_mode,
                                    dir_mode=self._dir_mode)
                self._need_to_create = False
        except:
            # If any problems happen, restore the original values and re-raise
            self._history = orig_history
            self._cache = orig_cache
            raise
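
    # Index line example (added for clarity, not in the original source): a
    # line-delta record stored at position 120 with size 48, one parent at
    # history position 0 and one ghost parent is appended to the .kndx file
    # as
    #
    #   "rev-2 line-delta 120 48 0 .ghost-rev :"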

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return details needed to access the version.

        .kndx indices do not support split-out data, so return None for the
        index field.

        :return: a tuple (None, data position, size) to hand to the access
            logic to get the record.
        """
        entry = self._cache[version_id]
        return None, entry[2], entry[3]

    def get_method(self, version_id):
        """Return compression method of specified version."""
        try:
            options = self._cache[version_id][1]
        except KeyError:
            raise RevisionNotPresent(version_id, self._filename)
        if 'fulltext' in options:
            return 'fulltext'
        else:
            if 'line-delta' not in options:
                raise errors.KnitIndexUnknownMethod(self._full_path(), options)
            return 'line-delta'

    def get_options(self, version_id):
        """Return a list of the options for this version, e.g. ['fulltext']."""
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        cache = self._cache
        for version_id in version_ids:
            if version_id not in cache:
                raise RevisionNotPresent(version_id, self._filename)


class KnitGraphIndex(object):
    """A knit index that builds on GraphIndex."""

    def __init__(self, graph_index, deltas=False, parents=True, add_callback=None):
        """Construct a KnitGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param deltas: Allow delta-compressed records.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        :param parents: If True, record knits parents, if not do not record
            parents.
        """
        self._graph_index = graph_index
        self._deltas = deltas
        self._add_callback = add_callback
        self._parents = parents
        if deltas and not parents:
            raise KnitCorrupt(self, "Cannot do delta compression without "
                "parent tracking.")

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index keys, - 1-tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise RevisionNotPresent(missing_keys.pop(), self)

    def _present_keys(self, version_ids):
        return set([
            node[1] for node in self._get_entries(version_ids)])

    def _parentless_ancestry(self, versions):
        """Honour the get_ancestry API for parentless knit indices."""
        wanted_keys = self._version_ids_to_keys(versions)
        present_keys = self._present_keys(wanted_keys)
        missing = set(wanted_keys).difference(present_keys)
        if missing:
            raise RevisionNotPresent(missing.pop(), self)
        return list(self._keys_to_version_ids(present_keys))

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        if not self._parents:
            return self._parentless_ancestry(versions)
        # XXX: This will do len(history) index calls - perhaps
        # it should be altered to be a index core feature?
        # get a graph of all the mentioned versions:
        graph = {}
        ghosts = set()
        versions = self._version_ids_to_keys(versions)
        pending = set(versions)
        while pending:
            # get all pending nodes
            this_iteration = pending
            new_nodes = self._get_entries(this_iteration)
            found = set()
            pending = set()
            for (index, key, value, node_refs) in new_nodes:
                # dont ask for ghosties - otherwise
                # we can end up looping with pending
                # being entirely ghosted.
                graph[key] = [parent for parent in node_refs[0]
                    if parent not in ghosts]
                # queue parents
                for parent in graph[key]:
                    # dont examine known nodes again
                    if parent in graph:
                        continue
                    pending.add(parent)
                found.add(key)
            ghosts.update(this_iteration.difference(found))
        if versions.difference(graph):
            raise RevisionNotPresent(versions.difference(graph).pop(), self)
        if topo_sorted:
            result_keys = topo_sort(graph.items())
        else:
            result_keys = graph.iterkeys()
        return [key[0] for key in result_keys]

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if not self._parents:
            return self._parentless_ancestry(versions)
        # XXX: This will do len(history) index calls - perhaps
        # it should be altered to be a index core feature?
        # get a graph of all the mentioned versions:
        graph = {}
        versions = self._version_ids_to_keys(versions)
        pending = set(versions)
        while pending:
            # get all pending nodes
            this_iteration = pending
            new_nodes = self._get_entries(this_iteration)
            pending = set()
            for (index, key, value, node_refs) in new_nodes:
                graph[key] = node_refs[0]
                # queue parents
                for parent in graph[key]:
                    # dont examine known nodes again
                    if parent in graph:
                        continue
                    pending.add(parent)
            missing_versions = this_iteration.difference(graph)
            missing_needed = versions.intersection(missing_versions)
            if missing_needed:
                raise RevisionNotPresent(missing_needed.pop(), self)
            for missing_version in missing_versions:
                # add a key, no parents
                graph[missing_version] = []
                pending.discard(missing_version) # don't look for it
        result_keys = topo_sort(graph.items())
        return [key[0] for key in result_keys]

    def get_graph(self):
        """Return a list of the node:parents lists from this knit index."""
        if not self._parents:
            return [(key, ()) for key in self.get_versions()]
        result = []
        for index, key, value, refs in self._graph_index.iter_all_entries():
            result.append((key[0], tuple([ref[0] for ref in refs[0]])))
        return result

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        if self._parents:
            all_nodes = set(self._get_entries(self._version_ids_to_keys(version_ids)))
            all_parents = set()
            present_parents = set()
            for node in all_nodes:
                all_parents.update(node[3][0])
                # any node we are querying must be present
                present_parents.add(node[1])
            unknown_parents = all_parents.difference(present_parents)
            present_parents.update(self._present_keys(unknown_parents))
            for node in all_nodes:
                parents = []
                for parent in node[3][0]:
                    if parent in present_parents:
                        parents.append(parent[0])
                yield node[1][0], tuple(parents)
        else:
            for node in self._get_entries(self._version_ids_to_keys(version_ids)):
                yield node[1][0], ()

    def num_versions(self):
        return len(list(self._graph_index.iter_all_entries()))

    __len__ = num_versions

    def get_versions(self):
        """Get all the versions in the file. not topologically sorted."""
        return [node[1][0] for node in self._graph_index.iter_all_entries()]

    def has_version(self, version_id):
        """True if the version is in the index."""
        return len(self._present_keys(self._version_ids_to_keys([version_id]))) == 1

    def _keys_to_version_ids(self, keys):
        return tuple(key[0] for key in keys)

    def get_position(self, version_id):
        """Return details needed to access the version.

        :return: a tuple (index, data position, size) to hand to the access
            logic to get the record.
        """
        node = self._get_node(version_id)
        bits = node[2][1:].split(' ')
        return node[0], int(bits[0]), int(bits[1])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        if not self._deltas:
            return 'fulltext'
        return self._parent_compression(self._get_node(version_id)[3][1])

    def _parent_compression(self, reference_list):
        # use the second reference list to decide if this is delta'd or not.
        if len(reference_list):
            return 'line-delta'
        else:
            return 'fulltext'

    def _get_node(self, version_id):
        try:
            return list(self._get_entries(self._version_ids_to_keys([version_id])))[0]
        except IndexError:
            raise RevisionNotPresent(version_id, self)

    def get_options(self, version_id):
        """Return a list of the options for this version, e.g. ['fulltext']."""
        node = self._get_node(version_id)
        if not self._deltas:
            options = ['fulltext']
        else:
            options = [self._parent_compression(node[3][1])]
        if node[2][0] == 'N':
            options.append('no-eol')
        return options

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        parents = list(self.iter_parents([version_id]))
        if not parents:
            # missing key
            raise errors.RevisionNotPresent(version_id, self)
        return parents[0][1]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        nodes = list(self._get_entries(self._version_ids_to_keys([version_id]),
            check_present=True))
        if not self._parents:
            return ()
        return self._keys_to_version_ids(nodes[0][3][0])

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        keys = self._version_ids_to_keys(version_ids)
        present = self._present_keys(keys)
        missing = keys.difference(present)
        if missing:
            raise RevisionNotPresent(missing.pop(), self)

    def add_version(self, version_id, options, access_memo, parents):
        """Add a version record to the index."""
        return self.add_versions(((version_id, options, access_memo, parents),))

    def add_versions(self, versions, random_id=False):
        """Add multiple versions to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        :param random_id: If True the ids being added were randomly generated
            and no check for existence will be performed.
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anymore.

        keys = {}
        for (version_id, options, access_memo, parents) in versions:
            index, pos, size = access_memo
            key = (version_id, )
            parents = tuple((parent, ) for parent in parents)
            if 'no-eol' in options:
                value = 'N'
            else:
                value = ' '
            value += "%d %d" % (pos, size)
            if not self._deltas:
                if 'line-delta' in options:
                    raise KnitCorrupt(self, "attempt to add line-delta in non-delta knit")
            if self._parents:
                if self._deltas:
                    if 'line-delta' in options:
                        node_refs = (parents, (parents[0],))
                    else:
                        node_refs = (parents, ())
                else:
                    node_refs = (parents, )
            else:
                if parents:
                    raise KnitCorrupt(self, "attempt to add node with parents "
                        "in parentless index.")
                node_refs = ()
            keys[key] = (value, node_refs)
        if not random_id:
            present_nodes = self._get_entries(keys)
            for (index, key, value, node_refs) in present_nodes:
                if (value, node_refs) != keys[key]:
                    raise KnitCorrupt(self, "inconsistent details in add_versions"
                        ": %s %s" % ((value, node_refs), keys[key]))
                del keys[key]
        result = []
        if self._parents:
            for key, (value, node_refs) in keys.iteritems():
                result.append((key, value, node_refs))
        else:
            for key, (value, node_refs) in keys.iteritems():
                result.append((key, value))
        self._add_callback(result)

    def _version_ids_to_keys(self, version_ids):
        return set((version_id, ) for version_id in version_ids)
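
    # Node value sketch (added for clarity, not in the original source): a
    # record stored at pos 120 with size 48 is encoded as the value
    # 'N120 48' when it has no trailing newline, or ' 120 48' otherwise;
    # with deltas enabled its node_refs are (parents, (compression_parent,))
    # for a line-delta and (parents, ()) for a fulltext.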


class _KnitAccess(object):
    """Access to knit records in a .knit file."""

    def __init__(self, transport, filename, _file_mode, _dir_mode,
        _need_to_create, _create_parent_dir):
        """Create a _KnitAccess for accessing and inserting data.

        :param transport: The transport the .knit is located on.
        :param filename: The filename of the .knit.
        """
        self._transport = transport
        self._filename = filename
        self._file_mode = _file_mode
        self._dir_mode = _dir_mode
        self._need_to_create = _need_to_create
        self._create_parent_dir = _create_parent_dir

    def add_raw_records(self, sizes, raw_data):
        """Add raw knit bytes to a storage area.

        The data is spooled to wherever the access method is storing data.

        :param sizes: An iterable containing the size of each raw data segment.
        :param raw_data: A bytestring containing the data.
        :return: A list of memos to retrieve the record later. Each memo is a
            tuple - (index, pos, length), where the index field is always None
            for the .knit access method.
        """
        assert type(raw_data) == str, \
            'data must be plain bytes was %s' % type(raw_data)
        if not self._need_to_create:
            base = self._transport.append_bytes(self._filename, raw_data)
        else:
            self._transport.put_bytes_non_atomic(self._filename, raw_data,
                create_parent_dir=self._create_parent_dir,
                mode=self._file_mode,
                dir_mode=self._dir_mode)
            self._need_to_create = False
            base = 0
        result = []
        for size in sizes:
            result.append((None, base, size))
            base += size
        return result

    def create(self):
        """IFF this data access has its own storage area, initialise it.

        :return: None.
        """
        self._transport.put_bytes_non_atomic(self._filename, '',
            mode=self._file_mode)

    def open_file(self):
        """IFF this data access can be represented as a single file, open it.

        For knits that are not mapped to a single file on disk this will
        always return None.

        :return: None or a file handle.
        """
        try:
            return self._transport.get(self._filename)
        except NoSuchFile:
            pass
        return None

    def get_raw_records(self, memos_for_retrieval):
        """Get the raw bytes for the records.

        :param memos_for_retrieval: An iterable containing the (index, pos,
            length) memo for retrieving the bytes. The .knit method ignores
            the index as there is always only a single file.
        :return: An iterator over the bytes of the records.
        """
        read_vector = [(pos, size) for (index, pos, size) in memos_for_retrieval]
        for pos, data in self._transport.readv(self._filename, read_vector):
            yield data
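
    # Memo round-trip sketch (added for clarity, not in the original
    # source), assuming a fresh .knit file:
    #
    #   memos = access.add_raw_records([len(b1), len(b2)], b1 + b2)
    #   # memos == [(None, 0, len(b1)), (None, len(b1), len(b2))]
    #   chunks = list(access.get_raw_records(memos))  # [b1, b2]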


class _PackAccess(object):
    """Access to knit records via a collection of packs."""

    def __init__(self, index_to_packs, writer=None):
        """Create a _PackAccess object.

        :param index_to_packs: A dict mapping index objects to the transport
            and file names for obtaining data.
        :param writer: A tuple (pack.ContainerWriter, write_index) which
            contains the pack to write, and the index that reads from it will
            be associated with.
        """
        if writer:
            self.container_writer = writer[0]
            self.write_index = writer[1]
        else:
            self.container_writer = None
            self.write_index = None
        self.indices = index_to_packs

    def add_raw_records(self, sizes, raw_data):
        """Add raw knit bytes to a storage area.

        The data is spooled to the container writer in one bytes-record per
        raw data unit.

        :param sizes: An iterable containing the size of each raw data segment.
        :param raw_data: A bytestring containing the data.
        :return: A list of memos to retrieve the record later. Each memo is a
            tuple - (index, pos, length), where the index field is the
            write_index object supplied to the PackAccess object.
        """
        assert type(raw_data) == str, \
            'data must be plain bytes was %s' % type(raw_data)
        result = []
        offset = 0
        for size in sizes:
            p_offset, p_length = self.container_writer.add_bytes_record(
                raw_data[offset:offset+size], [])
            offset += size
            result.append((self.write_index, p_offset, p_length))
        return result
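
    # Note (editorial): unlike _KnitAccess memos, the pos/length stored in
    # each memo here are the coordinates the ContainerWriter reports for its
    # bytes-record, not raw file offsets; get_raw_records below resolves
    # them through pack.make_readv_reader.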

    def create(self):
        """Pack based knits do not get individually created."""

    def get_raw_records(self, memos_for_retrieval):
        """Get the raw bytes for a list of records.

        :param memos_for_retrieval: An iterable containing the (index, pos,
            length) memo for retrieving the bytes. The Pack access method
            looks up the pack to use for a given record in its index_to_pack
            map.
        :return: An iterator over the bytes of the records.
        """
        # first pass, group into same-index requests
        request_lists = []
        current_list = []
        current_index = None
        for (index, offset, length) in memos_for_retrieval:
            if current_index == index:
                current_list.append((offset, length))
            else:
                if current_index is not None:
                    request_lists.append((current_index, current_list))
                current_index = index
                current_list = [(offset, length)]
        # handle the last entry
        if current_index is not None:
            request_lists.append((current_index, current_list))
        for index, offsets in request_lists:
            transport, path = self.indices[index]
            reader = pack.make_readv_reader(transport, path, offsets)
            for names, read_func in reader.iter_records():
                yield read_func(None)

    def open_file(self):
        """Pack based knits have no single file."""
        return None

    def set_writer(self, writer, index, (transport, packname)):
        """Set a writer to use for adding data."""
        if index is not None:
            self.indices[index] = (transport, packname)
        self.container_writer = writer
        self.write_index = index
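
# --- Illustrative sketch, not part of the original bzrlib code. ---
# The first pass of _PackAccess.get_raw_records coalesces consecutive memos
# that share an index into one readv request list per pack; a standalone
# rendering of that grouping (the index names are hypothetical):
def _demo_group_memos(memos):
    request_lists = []
    current_list = []
    current_index = None
    for (index, offset, length) in memos:
        if current_index == index:
            current_list.append((offset, length))
        else:
            if current_index is not None:
                request_lists.append((current_index, current_list))
            current_index = index
            current_list = [(offset, length)]
    if current_index is not None:
        request_lists.append((current_index, current_list))
    return request_lists

# _demo_group_memos([('idx-a', 0, 10), ('idx-a', 10, 5), ('idx-b', 0, 7)])
# => [('idx-a', [(0, 10), (10, 5)]), ('idx-b', [(0, 7)])]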

class _KnitData(object):
    """Manage extraction of data from a KnitAccess, caching and decompressing.

    The KnitData class provides the logic for parsing and using knit records,
    making use of an access method for the low level read and write operations.
    """

    def __init__(self, access):
        """Create a KnitData object.

        :param access: The access method to use. Access methods such as
            _KnitAccess manage the insertion of raw records and the subsequent
            retrieval of the same.
        """
        self._access = access
        self._checked = False
        # TODO: jam 20060713 conceptually, this could spill to disk
        #       if the cached size gets larger than a certain amount

InterVersionedFile.register_optimiser(WeaveToKnit)


class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced
            # the following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize


# Deprecated, use PatienceSequenceMatcher instead
KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher


def annotate_knit(knit, revision_id):
    """Annotate a knit with no cached annotations.

    This implementation is for knits with no cached annotations.
    It will work for knits with cached annotations, but this is not
    recommended.
    """
    ancestry = knit.get_ancestry(revision_id)
    fulltext = dict(zip(ancestry, knit.get_line_list(ancestry)))
    annotations = {}
    for candidate in ancestry:
        if candidate in annotations:
            continue
        parents = knit.get_parents(candidate)
        if len(parents) == 0:
            blocks = None
        elif knit._index.get_method(candidate) != 'line-delta':
            blocks = None
        else:
            parent, sha1, noeol, delta = knit.get_delta(candidate)
            blocks = KnitContent.get_line_delta_blocks(delta,
                fulltext[parents[0]], fulltext[candidate])
        annotations[candidate] = list(annotate.reannotate([annotations[p]
            for p in parents], fulltext[candidate], candidate, blocks))
    return iter(annotations[revision_id])
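
# --- Illustrative usage, not part of the original bzrlib code. ---
# annotate_knit relies on get_ancestry returning parents before children:
# each version is annotated exactly once, and for line-delta records the
# stored delta (via KnitContent.get_line_delta_blocks) supplies matching
# blocks so annotate.reannotate need not re-diff parent and child fulltexts.
# A hypothetical call (the knit object and revision id are assumptions):
#
#   for origin, text in annotate_knit(knit, 'rev-3'):
#       print '%s: %s' % (origin, text),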

try:
    from bzrlib._knit_load_data_c import _load_data_c as _load_data
except ImportError:
    from bzrlib._knit_load_data_py import _load_data_py as _load_data
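
# --- Illustrative sketch, not part of the original bzrlib code. ---
# After the deprecation above, KnitSequenceMatcher is simply
# patiencediff.PatienceSequenceMatcher, so the familiar difflib-style
# opcode API still applies to existing callers:
def _demo_knit_sequence_matcher():
    s = KnitSequenceMatcher(None, ['a\n', 'b\n'], ['a\n', 'c\n'])
    opcodes = s.get_opcodes()
    assert opcodes == [('equal', 0, 1, 0, 1), ('replace', 1, 2, 1, 2)]
    return opcodes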