# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
updates.

Knit file layout:
lifeless: the data file is made up of "delta records".  each delta record has a delta header
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent).  the delta also ends with an
end-marker; simply "end VERSION"

delta can be line or full contents.
... the 8's there are the index number of the annotation.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8         e.set('executable', 'yes')
8         if elt.get('executable') == 'yes':
8             ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

whats in an index:
09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation
"""

# TODOS:
# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave'
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?
from cStringIO import StringIO
import difflib
import operator

from itertools import izip, chain

import bzrlib
import bzrlib.ui
import bzrlib.weave
import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.tuned_gzip import GzipFile
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
        sha_strings
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = KnitSequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            # ofrom, oto, length, data
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]

    def copy(self):
        return KnitContent(self._lines[:])
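
# Illustrative usage sketch (not part of the original module): KnitContent
# pairs each line with its origin version, and line_delta_iter() expresses
# the difference to another content as (start, end, count, lines) hunks.
# The version ids and lines below are made up.
def _knit_content_example():
    old = KnitContent([('rev-1', 'a\n'), ('rev-1', 'b\n')])
    new = KnitContent([('rev-2', 'a\n'), ('rev-2', 'c\n')])
    # one hunk: replace old lines 1..2 with a single annotated line, i.e.
    # [(1, 2, 1, [('rev-2', 'c\n')])]
    return list(old.line_delta_iter(new))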


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version):
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        result = []
        lines = iter(lines)
        next = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            lines = lines[c:]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out
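
# Note (editorial, illustrative): for the same delta hunk the annotated
# factory serializes 'rev-2 c\n' (origin attached to every line), while the
# plain factory serializes just 'c\n'; the constant version id passed to
# make()/parse_* re-attaches origins when a plain record is read back.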


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None,
                 factory=None, basis_knit=None, delta=True, create=False):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        """
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            basis_knit
        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, []) # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while count < 25:  # assumed chain-length limit
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'.

        :param records: A list of tuples (version_id, options, parents, size).
        :param data: The data for the records. When it is written, the records
                     are adjusted to have pos pointing into data by the sum of
                     the preceding records sizes.
        """
        # write all the data
        pos = self._data.add_raw_record(data)
        index_entries = []
        for (version_id, options, parents, size) in records:
            index_entries.append((version_id, options, pos, size, parents))
            pos += size
        self._index.add_versions(index_entries)

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes
        transport.put(name + INDEX_SUFFIX + '.tmp',
                      self.transport.get(self._index._filename))
        # copy the data file
        transport.put(name + DATA_SUFFIX, self._data._open_file())
        # rename the copied index into place
        transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory,
                                 delta=self.delta, create=True)

    def _fix_parents(self, version, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        The parents list must be a superset of the current list.
        """
        current_values = self._index._cache[version]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        """See VersionedFile.get_sha1()."""
        record_map = self._get_record_map([version_id])
        method, content, digest, next = record_map[version_id]
        return digest

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        # maybe we have it
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if annotated:
            delta_seq = None
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = KnitSequenceMatcher(None, merge_content.text(), content.text())
                if delta_seq is None:
                    # setup a delta seq to reuse.
                    delta_seq = seq
                for i, j, n in seq.get_matching_blocks():
                    if n == 0:
                        continue
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if not annotated:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _get_components_positions(self, version_ids):
        """Produce a map of position data for the components of versions.

        This data is intended to be used for retrieving the knit records.

        A dict of version_id to (method, data_pos, data_size, next) is
        returned.
        method is the way referenced data should be applied.
        data_pos is the position of the data in the knit.
        data_size is the size of the data in the knit.
        next is the build-parent of the version, or None for fulltexts.
        """
        component_data = {}
        for version_id in version_ids:
            cursor = version_id
            while cursor is not None and cursor not in component_data:
                picked_knit = self
                method = picked_knit._index.get_method(cursor)
                if method == 'fulltext':
                    next = None
                else:
                    next = picked_knit.get_parents(cursor)[0]
                data_pos, data_size = self._index.get_position(cursor)
                component_data[cursor] = (method, data_pos, data_size, next)
                cursor = next
        return component_data
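
    # Illustrative shape of the map returned above (made-up ids and offsets):
    #   {'rev-1': ('fulltext',   0, 112, None),
    #    'rev-2': ('line-delta', 112, 44, 'rev-1')}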

    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        text_map, contents_map = self._get_content_maps([version_id])
        return contents_map[version_id]

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        self._check_lines_not_unicode(lines)
        self._check_lines_are_lines(lines)

    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        #  461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383   0      8.0370      8.0370   +<len>
        # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines:
            if lines[-1][-1] != '\n':
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            count = 0
            delta_parents = present_parents
            while count < 25:  # assumed chain-length limit
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                delta = False

        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
                                                  delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null
        # delta
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self.get_line_list([version_id])[0]

    def _get_record_map(self, version_ids):
        """Produce a dictionary of knit records.

        The keys are version_ids, the values are tuples of (method, content,
        digest, next).
        method is the way the content should be applied.
        content is a KnitContent object.
        digest is the SHA1 digest of this version id after all steps are done.
        next is the build-parent of the version, i.e. the leftmost ancestor.
        If the method is fulltext, next will be None.
        """
        position_map = self._get_components_positions(version_ids)
        # c = component_id, m = method, p = position, s = size, n = next
        records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]
        record_map = {}
        for component_id, content, digest in \
                self._data.read_records_iter(records):
            method, position, size, next = position_map[component_id]
            record_map[component_id] = method, content, digest, next

        return record_map

    def get_text(self, version_id):
        """See VersionedFile.get_text"""
        return self.get_texts([version_id])[0]

    def get_texts(self, version_ids):
        return [''.join(l) for l in self.get_line_list(version_ids)]

    def get_line_list(self, version_ids):
        """Return the texts of listed versions as a list of strings."""
        text_map, content_map = self._get_content_maps(version_ids)
        return [text_map[v] for v in version_ids]

    def _get_content_maps(self, version_ids):
        """Produce maps of text and KnitContents

        :return: (text_map, content_map) where text_map contains the texts for
        the requested versions and content_map contains the KnitContents.
        Both dicts take version_ids as their keys.
        """
        for version_id in version_ids:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        record_map = self._get_record_map(version_ids)

        text_map = {}
        content_map = {}
        final_content = {}
        for version_id in version_ids:
            components = []
            cursor = version_id
            while cursor is not None:
                method, data, digest, next = record_map[cursor]
                components.append((cursor, method, data, digest))
                if cursor in content_map:
                    break
                cursor = next

            content = None
            for component_id, method, data, digest in reversed(components):
                if component_id in content_map:
                    content = content_map[component_id]
                else:
                    version_idx = self._index.lookup(component_id)
                    if method == 'fulltext':
                        assert content is None
                        content = self.factory.parse_fulltext(data, version_idx)
                    elif method == 'line-delta':
                        delta = self.factory.parse_line_delta(data[:],
                                                              version_idx)
                        content = content.copy()
                        content._lines = self._apply_delta(content._lines,
                                                           delta)
                    content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                content = content.copy()
                line = content._lines[-1][1].rstrip('\n')
                content._lines[-1] = (content._lines[-1][0], line)
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            if sha_strings(text) != digest:
                raise KnitCorrupt(self.filename,
                                  'sha-1 does not match %s' % version_id)

            text_map[version_id] = text
        return text_map, final_content
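
    # The loop above walks each build chain backwards, then applies it
    # forwards: e.g. (illustrative) for a chain rev-3 -> rev-2 -> rev-1 the
    # rev-1 fulltext is parsed first, then the rev-2 and rev-3 line-deltas
    # are applied in order, caching each intermediate KnitContent.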

    def iter_lines_added_or_present_in_versions(self, version_ids=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = list(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        version_ids = []
        for version_id in self.versions():
            if version_id in requested_versions:
                version_ids.append(version_id)
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        count = 0
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            total = len(version_id_records)
            pb.update('Walking content.', count, total)
            for version_id, data, sha_value in \
                    self._data.read_records_iter(version_id_records):
                pb.update('Walking content.', count, total)
                method = self._index.get_method(version_id)
                version_idx = self._index.lookup(version_id)
                assert method in ('fulltext', 'line-delta')
                if method == 'fulltext':
                    content = self.factory.parse_fulltext(data, version_idx)
                    for line in content.text():
                        yield line
                else:
                    delta = self.factory.parse_line_delta(data, version_idx)
                    for start, end, count, lines in delta:
                        for origin, line in lines:
                            yield line
                count += 1
            pb.update('Walking content.', total, total)
            pb.finished()
        except:
            pb.update('Walking content.', total, total)
            pb.finished()
            raise

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        # 52554 calls in 1264 872 internal down from 3674
        try:
            return self._index.get_parents(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents."""
        try:
            return self._index.get_parents_with_ghosts(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)

    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b))
        def status_a(revision, text):
            if revision in ancestors_b:
                return 'killed-b', text
            else:
                return 'new-a', text

        ancestors_a = set(self.get_ancestry(ver_a))
        def status_b(revision, text):
            if revision in ancestors_a:
                return 'killed-a', text
            else:
                return 'new-b', text

        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        plain_a = [t for (a, t) in annotated_a]
        plain_b = [t for (a, t) in annotated_b]
        blocks = KnitSequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
        a_cur = 0
        b_cur = 0
        for ai, bi, l in blocks:
            # process all mismatched sections
            # (last mismatched section is handled because blocks always
            # includes a 0-length last block)
            for revision, text in annotated_a[a_cur:ai]:
                yield status_a(revision, text)
            for revision, text in annotated_b[b_cur:bi]:
                yield status_b(revision, text)

            # and now the matched section
            a_cur = ai + l
            b_cur = bi + l
            for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
                assert text_a == text_b
                yield "unchanged", text_a


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode, file_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER),
            mode=self._file_mode):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.readline()
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to an Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.

    Duplicate entries may be written to the index for a single version id;
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record's
        compressed data.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    partial writes:
    When a write to the index file is interrupted, it will result in a line
    that does not end in ' :'. If the ' :' is not present at the end of a line,
    or at the end of the file, then the record that is missing it will be
    ignored by the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """

    HEADER = "# bzr knit index 8\n"
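
    # An illustrative index line (made-up id and numbers), following the
    # format described above: revision 'rev-2' stored as a line-delta at
    # byte offset 112, length 44, with the revision at sequence number 0
    # and a ghost 'rev-ghost' as parents:
    #
    #   rev-2 line-delta 112 44 0 .rev-ghost :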

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indices).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once)
                # probably what we want for nice behaviour on
                # http is an incremental readlines that yields, or
                # a check for local vs non local indexes,
                for l in fp.readlines():
                    rec = l.split()
                    if len(rec) < 5 or rec[-1] != ':':
                        # FIXME: in the future we should determine if it's a
                        # short write - and ignore it
                        # or a different failure, and raise. RBC 20060407
                        continue
                    count += 1
                    total += 1
                    #pb.update('read knit index', count, total)
                    # See self._parse_parents
                    parents = []
                    for value in rec[4:-1]:
                        if value[0] == '.':
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(self._history[int(value)])
                    # end self._parse_parents
                    # self._cache_version(rec[0],
                    #                     rec[1].split(','),
                    #                     int(rec[2]),
                    #                     int(rec[3]),
                    #                     parents)
                    # --- self._cache_version
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = rec[0]
                    if version_id not in self._cache:
                        index = len(self._history)
                        self._history.append(version_id)
                    else:
                        index = self._cache[version_id][5]
                    self._cache[version_id] = (version_id,
                                               rec[1].split(','),
                                               int(rec[2]),
                                               int(rec[3]),
                                               parents,
                                               index)
                    # --- self._cache_version
            except NoSuchFile, e:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
             use in partial index reads. However bulk processing now has
             it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok
            # trim ghosts
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            try:
                parents = self._cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # got the parents ok
                for parent in parents:
                    if parent not in graph:
                        pending.add(parent)
                graph[version] = parents
        return topo_sort(graph.items())

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        return self._history

    def idx_to_name(self, idx):
        return self._history[idx]

    def lookup(self, version_id):
        assert version_id in self._cache
        return self._cache[version_id][5]

    def _version_list_to_index(self, versions):
        result_list = []
        for version in versions:
            if version in self._cache:
                # -- inlined lookup() --
                result_list.append(str(self._cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version.encode('utf-8'))
        return ' '.join(result_list)
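
    # e.g. (illustrative): parents ['rev-1', 'rev-ghost'] serialize as
    # '0 .rev-ghost' when 'rev-1' is the entry at sequence number 0 --
    # present parents by sequence number, ghosts by '.'-prefixed id.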

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        lines = []
        for version_id, options, pos, size, parents in versions:
            line = "\n%s %s %s %s %s :" % (version_id.encode('utf-8'),
                                           ','.join(options),
                                           pos,
                                           size,
                                           self._version_list_to_index(parents))
            assert isinstance(line, str), \
                'content must be utf-8 encoded: %r' % (line,)
            lines.append(line)
        self._transport.append(self._filename, StringIO(''.join(lines)))
        # cache after writing, so that a failed write leads to missing cache
        # entries not extra ones. XXX TODO: RBC 20060502 in the event of a
        # failure, reload the index or flush it or some such, to prevent
        # writing records that did complete twice.
        for version_id, options, pos, size, parents in versions:
            self._cache_version(version_id, options, pos, size, parents)

    def has_version(self, version_id):
        """True if the version is in the index."""
        return self._cache.has_key(version_id)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self._filename)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    HEADER = "# bzr knit data 8\n"

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._file = None
        self._checked = False
        self._records = {}
        if create:
            self._transport.put(self._filename, StringIO(''), mode=file_mode)

    def clear_cache(self):
        """Clear the record cache."""
        self._records = {}

    def _open_file(self):
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id.encode('utf-8'),
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n" % version_id.encode('utf-8')]))
        data_file.close()
        length = sio.tell()

        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file.

        :return: the offset in the data file raw_data was written.
        """
        assert isinstance(raw_data, str), 'data must be plain bytes'
        return self._transport.append(self._filename, StringIO(raw_data))

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        # cache
        self._records[version_id] = (digest, lines)
        # write to disk
        start_pos = self._transport.append(self._filename, sio)
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if rec[1].decode('utf-8') != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                  version_id, rec[1]))
        return df, rec

    def _parse_record(self, version_id, data):
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df, rec = self._parse_record_header(version_id, data)
        record_contents = df.readlines()
        l = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        df.close()
        return record_contents, rec[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.

        It will actively recompress currently cached records on the
        basis that that is cheaper than I/O activity.
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(needed_records):
            # grab the disk data needed.
            raw_records = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

        for version_id, pos, size in records:
            if version_id in self._records:
                # compress a new version
                size, sio = self._record_to_data(version_id,
                                                 self._records[version_id][0],
                                                 self._records[version_id][1])
                yield version_id, sio.getvalue()
            else:
                pos, data = raw_records.next()
                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
                yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).
        """
        # 60890  calls for 4168 extractions in 5045, 683 internal.
        # 4168   calls to readv              in 1411
        # 4168   calls to parse_record       in 2880

        needed_records = set()
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.add((version_id, pos, size))

        # turn our set into a list, sorted by file position
        needed_records = sorted(needed_records, key=operator.itemgetter(1))

        if len(needed_records):
            # We take it that the transport optimizes the fetching as well
            # as possible (ie, reads contiguous ranges.)
            response = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

            for (record_id, pos, size), (pos, data) in \
                    izip(iter(needed_records), response):
                content, digest = self._parse_record(record_id, data)
                self._records[record_id] = (digest, content)

        for version_id, pos, size in records:
            yield version_id, list(self._records[version_id][1]), self._records[version_id][0]

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits.  """
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            continue
                        parent_ancestors = self.source.get_ancestry(parent)
                        if version in parent_ancestors:
                            raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     iter(copy_queue)):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)


class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            return (isinstance(source, bzrlib.weave.Weave) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # if all of n2's parents are in n1, then it's fine.
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            continue
                        parent_ancestors = self.source.get_ancestry(parent)
                        if version in parent_ancestors:
                            raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count = count + 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(WeaveToKnit)


class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704   0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674   0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519   0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to:
            #     704   0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400   0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519   0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j < blo:
                        continue
                    if j >= bhi:
                        break
                    k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                    if k > bestsize:
                        besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize
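

# Illustrative use (not part of the original module): KnitSequenceMatcher
# keeps the difflib.SequenceMatcher API, so for two lists of lines:
#   sm = KnitSequenceMatcher(None, ['a\n', 'b\n'], ['a\n', 'c\n'])
#   sm.get_matching_blocks()  ->  [(0, 0, 1), (2, 2, 0)]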