192
178
yield s_pos + (target_len - t_pos), target_len, 0
195
class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version_id):
        """Build a KnitContent whose every line is attributed to *version_id*.

        Each stored line becomes an (origin, text) pair; for freshly made
        content the origin of every line is the supplied version_id.
        """
        annotated = [(version_id, text) for text in lines]
        return KnitContent(annotated)
203
class KnitAnnotateFactory(_KnitFactory):
181
class AnnotatedKnitContent(KnitContent):
182
"""Annotated content."""
184
def __init__(self, lines):
187
def annotate_iter(self):
188
"""Yield tuples of (origin, text) for each content line."""
189
return iter(self._lines)
191
def strip_last_line_newline(self):
192
line = self._lines[-1][1].rstrip('\n')
193
self._lines[-1] = (self._lines[-1][0], line)
196
return [text for origin, text in self._lines]
199
return AnnotatedKnitContent(self._lines[:])
202
class PlainKnitContent(KnitContent):
203
"""Unannotated content.
205
When annotate[_iter] is called on this content, the same version is reported
206
for all lines. Generally, annotate[_iter] is not useful on PlainKnitContent
210
def __init__(self, lines, version_id):
212
self._version_id = version_id
214
def annotate_iter(self):
215
"""Yield tuples of (origin, text) for each content line."""
216
for line in self._lines:
217
yield self._version_id, line
220
return PlainKnitContent(self._lines[:], self._version_id)
222
def strip_last_line_newline(self):
223
self._lines[-1] = self._lines[-1].rstrip('\n')
229
class KnitAnnotateFactory(object):
204
230
"""Factory for creating annotated Content objects."""
234
def make(self, lines, version_id):
235
num_lines = len(lines)
236
return AnnotatedKnitContent(zip([version_id] * num_lines, lines))
208
238
def parse_fulltext(self, content, version_id):
209
239
"""Convert fulltext to internal representation
454
487
return fulltext_size > delta_size
456
def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
457
"""See VersionedFile._add_delta()."""
458
self._check_add(version_id, []) # should we check the lines ?
459
self._check_versions_present(parents)
463
for parent in parents:
464
if not self.has_version(parent):
465
ghosts.append(parent)
467
present_parents.append(parent)
469
if delta_parent is None:
470
# reconstitute as full text.
471
assert len(delta) == 1 or len(delta) == 0
473
assert delta[0][0] == 0
474
assert delta[0][1] == 0, delta[0][1]
475
return super(KnitVersionedFile, self)._add_delta(version_id,
486
options.append('no-eol')
488
if delta_parent is not None:
489
# determine the current delta chain length.
490
# To speed the extract of texts the delta chain is limited
491
# to a fixed number of deltas. This should minimize both
492
# I/O and the time spend applying deltas.
493
# The window was changed to a maximum of 200 deltas, but also added
494
# was a check that the total compressed size of the deltas is
495
# smaller than the compressed size of the fulltext.
496
if not self._check_should_delta([delta_parent]):
497
# We don't want a delta here, just do a normal insertion.
498
return super(KnitVersionedFile, self)._add_delta(version_id,
505
options.append('line-delta')
506
store_lines = self.factory.lower_line_delta(delta)
508
access_memo = self._data.add_record(version_id, digest, store_lines)
509
self._index.add_version(version_id, options, access_memo, parents)
511
489
def _add_raw_records(self, records, data):
512
490
"""Add all the records 'records' with data pre-joined in 'data'.
855
830
"""Check that all specified versions are present."""
856
831
self._index.check_versions_present(version_ids)
858
def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
833
def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts,
834
nostore_sha, random_id, check_content):
859
835
"""See VersionedFile.add_lines_with_ghosts()."""
860
self._check_add(version_id, lines)
861
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
836
self._check_add(version_id, lines, random_id, check_content)
837
return self._add(version_id, lines, parents, self.delta,
838
parent_texts, None, nostore_sha)
863
840
def _add_lines(self, version_id, parents, lines, parent_texts,
864
left_matching_blocks=None):
841
left_matching_blocks, nostore_sha, random_id, check_content):
865
842
"""See VersionedFile.add_lines."""
866
self._check_add(version_id, lines)
843
self._check_add(version_id, lines, random_id, check_content)
867
844
self._check_versions_present(parents)
868
845
return self._add(version_id, lines[:], parents, self.delta,
869
parent_texts, left_matching_blocks)
846
parent_texts, left_matching_blocks, nostore_sha)
871
def _check_add(self, version_id, lines):
848
def _check_add(self, version_id, lines, random_id, check_content):
872
849
"""check that version_id and lines are safe to add."""
873
assert self.writable, "knit is not opened for write"
874
### FIXME escape. RBC 20060228
875
850
if contains_whitespace(version_id):
876
851
raise InvalidRevisionId(version_id, self.filename)
877
852
self.check_not_reserved_id(version_id)
878
if self.has_version(version_id):
853
# Technically this could be avoided if we are happy to allow duplicate
854
# id insertion when other things than bzr core insert texts, but it
855
# seems useful for folk using the knit api directly to have some safety
856
# blanket that we can disable.
857
if not random_id and self.has_version(version_id):
879
858
raise RevisionAlreadyPresent(version_id, self.filename)
880
self._check_lines_not_unicode(lines)
881
self._check_lines_are_lines(lines)
860
self._check_lines_not_unicode(lines)
861
self._check_lines_are_lines(lines)
883
863
def _add(self, version_id, lines, parents, delta, parent_texts,
884
left_matching_blocks=None):
864
left_matching_blocks, nostore_sha):
885
865
"""Add a set of lines on top of version specified by parents.
887
867
If delta is true, compress the text as a line-delta against
902
882
# +61 0 1918.1800 5.2640 +bzrlib.knit:359(_merge_annotations)
904
884
present_parents = []
906
885
if parent_texts is None:
907
886
parent_texts = {}
908
887
for parent in parents:
909
if not self.has_version(parent):
910
ghosts.append(parent)
888
if self.has_version(parent):
912
889
present_parents.append(parent)
914
if delta and not len(present_parents):
891
# can only compress against the left most present parent.
893
(len(present_parents) == 0 or
894
present_parents[0] != parents[0])):
917
897
digest = sha_strings(lines)
898
if nostore_sha == digest:
899
raise errors.ExistingContent
918
900
text_length = sum(map(len, lines))
921
903
if lines[-1][-1] != '\n':
904
# copy the contents of lines.
922
906
options.append('no-eol')
923
907
lines[-1] = lines[-1] + '\n'
925
if len(present_parents) and delta:
926
910
# To speed the extract of texts the delta chain is limited
927
911
# to a fixed number of deltas. This should minimize both
928
912
# I/O and the time spend applying deltas.
929
913
delta = self._check_should_delta(present_parents)
931
915
assert isinstance(version_id, str)
932
lines = self.factory.make(lines, version_id)
916
content = self.factory.make(lines, version_id)
933
917
if delta or (self.factory.annotated and len(present_parents) > 0):
934
# Merge annotations from parent texts if so is needed.
935
delta_hunks = self._merge_annotations(lines, present_parents,
918
# Merge annotations from parent texts if needed.
919
delta_hunks = self._merge_annotations(content, present_parents,
936
920
parent_texts, delta, self.factory.annotated,
937
921
left_matching_blocks)
1035
1019
elif method == 'line-delta':
1036
1020
delta = self.factory.parse_line_delta(data, version_id)
1037
1021
content = content.copy()
1038
content._lines = self._apply_delta(content._lines,
1022
content._lines = self._apply_delta(content._lines,
1040
1024
content_map[component_id] = content
1042
1026
if 'no-eol' in self._index.get_options(version_id):
1043
1027
content = content.copy()
1044
line = content._lines[-1][1].rstrip('\n')
1045
content._lines[-1] = (content._lines[-1][0], line)
1028
content.strip_last_line_newline()
1046
1029
final_content[version_id] = content
1048
1031
# digest here is the digest from the last applied component.
1049
1032
text = content.text()
1050
1033
if sha_strings(text) != digest:
1051
raise KnitCorrupt(self.filename,
1034
raise KnitCorrupt(self.filename,
1052
1035
'sha-1 does not match %s' % version_id)
1054
text_map[version_id] = text
1055
return text_map, final_content
1037
text_map[version_id] = text
1038
return text_map, final_content
1057
1040
def iter_lines_added_or_present_in_versions(self, version_ids=None,
2395
2378
InterVersionedFile.register_optimiser(WeaveToKnit)
2398
class KnitSequenceMatcher(difflib.SequenceMatcher):
2399
"""Knit tuned sequence matcher.
2401
This is based on profiling of difflib which indicated some improvements
2402
for our usage pattern.
2405
def find_longest_match(self, alo, ahi, blo, bhi):
2406
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
2408
If isjunk is not defined:
2410
Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
2411
alo <= i <= i+k <= ahi
2412
blo <= j <= j+k <= bhi
2413
and for all (i',j',k') meeting those conditions,
2416
and if i == i', j <= j'
2418
In other words, of all maximal matching blocks, return one that
2419
starts earliest in a, and of all those maximal matching blocks that
2420
start earliest in a, return the one that starts earliest in b.
2422
>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
2423
>>> s.find_longest_match(0, 5, 0, 9)
2426
If isjunk is defined, first the longest matching block is
2427
determined as above, but with the additional restriction that no
2428
junk element appears in the block. Then that block is extended as
2429
far as possible by matching (only) junk elements on both sides. So
2430
the resulting block never matches on junk except as identical junk
2431
happens to be adjacent to an "interesting" match.
2433
Here's the same example as before, but considering blanks to be
2434
junk. That prevents " abcd" from matching the " abcd" at the tail
2435
end of the second sequence directly. Instead only the "abcd" can
2436
match, and matches the leftmost "abcd" in the second sequence:
2438
>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
2439
>>> s.find_longest_match(0, 5, 0, 9)
2442
If no blocks match, return (alo, blo, 0).
2444
>>> s = SequenceMatcher(None, "ab", "c")
2445
>>> s.find_longest_match(0, 2, 0, 1)
2449
# CAUTION: stripping common prefix or suffix would be incorrect.
2453
# Longest matching block is "ab", but if common prefix is
2454
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
2455
# strip, so ends up claiming that ab is changed to acab by
2456
# inserting "ca" in the middle. That's minimal but unintuitive:
2457
# "it's obvious" that someone inserted "ac" at the front.
2458
# Windiff ends up at the same place as diff, but by pairing up
2459
# the unique 'b's and then matching the first two 'a's.
2461
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
2462
besti, bestj, bestsize = alo, blo, 0
2463
# find longest junk-free match
2464
# during an iteration of the loop, j2len[j] = length of longest
2465
# junk-free match ending with a[i-1] and b[j]
2469
for i in xrange(alo, ahi):
2470
# look at all instances of a[i] in b; note that because
2471
# b2j has no junk keys, the loop is skipped if a[i] is junk
2472
j2lenget = j2len.get
2475
# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
2476
# following improvement
2477
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
2478
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
2479
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
2481
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
2482
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
2483
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
2495
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
2497
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
2500
# Extend the best by non-junk elements on each end. In particular,
2501
# "popular" non-junk elements aren't in b2j, which greatly speeds
2502
# the inner loop above, but also means "the best" match so far
2503
# doesn't contain any junk *or* popular non-junk elements.
2504
while besti > alo and bestj > blo and \
2505
not isbjunk(b[bestj-1]) and \
2506
a[besti-1] == b[bestj-1]:
2507
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
2508
while besti+bestsize < ahi and bestj+bestsize < bhi and \
2509
not isbjunk(b[bestj+bestsize]) and \
2510
a[besti+bestsize] == b[bestj+bestsize]:
2513
# Now that we have a wholly interesting match (albeit possibly
2514
# empty!), we may as well suck up the matching junk on each
2515
# side of it too. Can't think of a good reason not to, and it
2516
# saves post-processing the (possibly considerable) expense of
2517
# figuring out what to do with it. In the case of an empty
2518
# interesting match, this is clearly the right thing to do,
2519
# because no other kind of match is possible in the regions.
2520
while besti > alo and bestj > blo and \
2521
isbjunk(b[bestj-1]) and \
2522
a[besti-1] == b[bestj-1]:
2523
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
2524
while besti+bestsize < ahi and bestj+bestsize < bhi and \
2525
isbjunk(b[bestj+bestsize]) and \
2526
a[besti+bestsize] == b[bestj+bestsize]:
2527
bestsize = bestsize + 1
2529
return besti, bestj, bestsize
2381
# Deprecated, use PatienceSequenceMatcher instead
2382
KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher
2532
2385
def annotate_knit(knit, revision_id):