# Copyright (C) 2005, 2006 Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
writes.

lifeless: the data file is made up of "delta records".  each delta record has a delta header
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent).  the delta also ends with an
end-marker; simply "end VERSION"

delta can be line or full contents.
... the 8's there are the index number of the annotation.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8 e.set('executable', 'yes')
8 if elt.get('executable') == 'yes':
8     ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation
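
Example use (a minimal, hypothetical sketch; `transport` is assumed to be a
writable bzrlib transport and the version ids shown are made up):

    knit = KnitVersionedFile('example', transport, create=True)
    knit.add_lines('rev-1', [], ['first line\n'])
    knit.add_lines('rev-2', ['rev-1'], ['first line\n', 'second line\n'])
    assert knit.get_lines('rev-2') == ['first line\n', 'second line\n']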
"""


# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?
from cStringIO import StringIO
import difflib
from itertools import izip, chain
import operator
import warnings

import bzrlib
import bzrlib.ui
import bzrlib.weave
from bzrlib import cache_utf8, errors, progress
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.tuned_gzip import GzipFile
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
        sha_strings
from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = KnitSequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            # ofrom, oto, length, data
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]

    def copy(self):
        return KnitContent(self._lines[:])


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        decode_utf8 = cache_utf8.decode
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((decode_utf8(origin), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines):
        for result_item in self.parse_line_delta(lines, None):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
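
        For example (a hypothetical hunk; the revision id and text are made
        up), the serialized form:

            1,2,1
            rev-id-1 replacement line

        parses to (1, 2, 1, [('rev-id-1', 'replacement line\n')]), i.e. lines
        1..2 of the parent are replaced by the single annotated line given.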
        """
        decode_utf8 = cache_utf8.decode
        result = []
        lines = iter(lines)
        next = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            remaining = count
            while remaining:
                origin, text = next().split(' ', 1)
                remaining -= 1
                contents.append((decode_utf8(origin), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        encode_utf8 = cache_utf8.encode
        return ['%s %s' % (encode_utf8(o), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        encode_utf8 = cache_utf8.encode
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(encode_utf8(origin) + ' ' + text
                       for origin, text in lines)
        return out


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            del lines[:c]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory)


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None,
                 factory=None, basis_knit=DEPRECATED_PARAMETER, delta=True,
                 create=False, create_parent_dir=False, delay_create=False,
                 dir_mode=None):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        :param create_parent_dir: If True, create the parent directory if
            creating the file fails. (This is used for stores with
            hash-prefixes that may not exist yet)
        :param delay_create: The calling code is aware that the knit won't
            actually be created until the first data is stored.
        """
        if deprecated_passed(basis_knit):
            warnings.warn("KnitVersionedFile.__init__(): The basis_knit parameter is"
                 " deprecated as of bzr 0.9.",
                 DeprecationWarning, stacklevel=2)
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        self.transport = transport
        self.filename = relpath
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create, file_mode=file_mode,
            create_parent_dir=create_parent_dir, delay_create=delay_create,
            dir_mode=dir_mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode,
            create_parent_dir=create_parent_dir, delay_create=delay_create,
            dir_mode=dir_mode)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))
    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, []) # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                parents, delta_parent, sha1, noeol, delta)

        digest = sha1
        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spend applying deltas.
            count = 0
            delta_parents = [delta_parent]
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not find a fulltext in the delta limit.
                # just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                    parents, delta_parent, sha1, noeol, delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'.

        :param records: A list of tuples(version_id, options, parents, size).
        :param data: The data for the records. When it is written, the records
                     are adjusted to have pos pointing into data by the sum of
                     the preceding records sizes.
        """
        pos = self._data.add_raw_record(data)
        offset = 0
        index_entries = []
        for (version_id, options, parents, size) in records:
            index_entries.append((version_id, options, pos+offset,
                                  size, parents))
            if self._data._do_cache:
                self._data._cache[version_id] = data[offset:offset+size]
            offset += size
        self._index.add_versions(index_entries)
    def enable_cache(self):
        """Start caching data for this knit"""
        self._data.enable_cache()

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes
        transport.put_file_non_atomic(name + INDEX_SUFFIX + '.tmp',
                    self.transport.get(self._index._filename))
        f = self._data._open_file()
        try:
            transport.put_file(name + DATA_SUFFIX, f)
        finally:
            f.close()
        # move the copied index into place
        transport.move(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory,
                                 delta=self.delta, create=True)
    def _fix_parents(self, version, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        the parents list must be a superset of the current
        parents.
        """
        current_values = self._index._cache[version]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        version_idx = self._index.lookup(version_id)
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_idx)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            return parent, sha1, noeol, delta
    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        """See VersionedFile.get_sha1()."""
        record_map = self._get_record_map([version_id])
        method, content, digest, next = record_map[version_id]
        return digest

    def get_suffixes(self):
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version
    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        delta_seq = None
        for parent_id in parents:
            merge_content = self._get_content(parent_id, parent_texts)
            seq = KnitSequenceMatcher(None, merge_content.text(), content.text())
            if delta_seq is None:
                # setup a delta seq to reuse.
                delta_seq = seq
            for i, j, n in seq.get_matching_blocks():
                if n == 0:
                    continue
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            reference_content = self._get_content(parents[0], parent_texts)
            new_texts = content.text()
            old_texts = reference_content.text()
            delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks
    def _get_components_positions(self, version_ids):
        """Produce a map of position data for the components of versions.

        This data is intended to be used for retrieving the knit records.

        A dict of version_id to (method, data_pos, data_size, next) is
        returned.
        method is the way referenced data should be applied.
        data_pos is the position of the data in the knit.
        data_size is the size of the data in the knit.
        next is the build-parent of the version, or None for fulltexts.
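
        For example (hypothetical version ids and offsets), a knit where
        'rev-2' is stored as a line-delta against the fulltext 'rev-1' could
        yield:

            {'rev-2': ('line-delta', 907, 84, 'rev-1'),
             'rev-1': ('fulltext', 0, 907, None)}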
        """
        component_data = {}
        for version_id in version_ids:
            cursor = version_id
            while cursor is not None and cursor not in component_data:
                method = self._index.get_method(cursor)
                if method == 'fulltext':
                    next = None
                else:
                    next = self.get_parents(cursor)[0]
                data_pos, data_size = self._index.get_position(cursor)
                component_data[cursor] = (method, data_pos, data_size, next)
                cursor = next
        return component_data
    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        text_map, contents_map = self._get_content_maps([version_id])
        return contents_map[version_id]

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        self._check_lines_not_unicode(lines)
        self._check_lines_are_lines(lines)
    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first present parent.

        Any versions not present will be converted into ghosts.
        """
        #  461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383   0      8.0370      8.0370   +<len>
        # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines[-1][-1] != '\n':
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spend applying deltas.
            count = 0
            delta_parents = present_parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                delta = False

        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if so is needed.
            delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
                                                  delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null
        # delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self.get_line_list([version_id])[0]

    def _get_record_map(self, version_ids):
        """Produce a dictionary of knit records.

        The keys are version_ids, the values are tuples of (method, content,
        digest, next).
        method is the way the content should be applied.
        content is a KnitContent object.
        digest is the SHA1 digest of this version id after all steps are done.
        next is the build-parent of the version, i.e. the leftmost ancestor.
        If the method is fulltext, next will be None.
        """
        position_map = self._get_components_positions(version_ids)
        # c = component_id, m = method, p = position, s = size, n = next
        records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]
        record_map = {}
        for component_id, content, digest in \
                self._data.read_records_iter(records):
            method, position, size, next = position_map[component_id]
            record_map[component_id] = method, content, digest, next

        return record_map

    def get_text(self, version_id):
        """See VersionedFile.get_text"""
        return self.get_texts([version_id])[0]

    def get_texts(self, version_ids):
        return [''.join(l) for l in self.get_line_list(version_ids)]

    def get_line_list(self, version_ids):
        """Return the texts of listed versions as a list of strings."""
        text_map, content_map = self._get_content_maps(version_ids)
        return [text_map[v] for v in version_ids]
    def _get_content_maps(self, version_ids):
        """Produce maps of text and KnitContents

        :return: (text_map, content_map) where text_map contains the texts for
        the requested versions and content_map contains the KnitContents.
        Both dicts take version_ids as their keys.
        """
        for version_id in version_ids:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        record_map = self._get_record_map(version_ids)

        text_map = {}
        content_map = {}
        final_content = {}
        for version_id in version_ids:
            components = []
            cursor = version_id
            while cursor is not None:
                method, data, digest, next = record_map[cursor]
                components.append((cursor, method, data, digest))
                if cursor in content_map:
                    break
                cursor = next

            content = None
            for component_id, method, data, digest in reversed(components):
                if component_id in content_map:
                    content = content_map[component_id]
                else:
                    version_idx = self._index.lookup(component_id)
                    if method == 'fulltext':
                        assert content is None
                        content = self.factory.parse_fulltext(data, version_idx)
                    elif method == 'line-delta':
                        delta = self.factory.parse_line_delta(data[:],
                                                              version_idx)
                        content = content.copy()
                        content._lines = self._apply_delta(content._lines,
                                                           delta)
                    content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                content = content.copy()
                line = content._lines[-1][1].rstrip('\n')
                content._lines[-1] = (content._lines[-1][0], line)
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            if sha_strings(text) != digest:
                raise KnitCorrupt(self.filename,
                                  'sha-1 does not match %s' % version_id)

            text_map[version_id] = text
        return text_map, final_content
    def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                                pb=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        if pb is None:
            pb = progress.DummyProgress()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = list(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        version_ids = []
        for version_id in self.versions():
            if version_id in requested_versions:
                version_ids.append(version_id)
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        count = 0
        total = len(version_id_records)
        pb.update('Walking content.', count, total)
        for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
            pb.update('Walking content.', count, total)
            method = self._index.get_method(version_id)
            version_idx = self._index.lookup(version_id)
            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                content = self.factory.parse_fulltext(data, version_idx)
                for line in content.text():
                    yield line
            else:
                delta = self.factory.parse_line_delta(data, version_idx)
                for start, end, count, lines in delta:
                    for origin, line in lines:
                        yield line
        pb.update('Walking content.', total, total)
    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        # 52554 calls in 1264 872 internal down from 3674
        try:
            return self._index.get_parents(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents."""
        try:
            return self._index.get_parents_with_ghosts(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)
    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line
    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ancestors_b = set(self.get_ancestry(ver_b))
        def status_a(revision, text):
            if revision in ancestors_b:
                return 'killed-b', text
            else:
                return 'new-a', text

        ancestors_a = set(self.get_ancestry(ver_a))
        def status_b(revision, text):
            if revision in ancestors_a:
                return 'killed-a', text
            else:
                return 'new-b', text

        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        plain_a = [t for (a, t) in annotated_a]
        plain_b = [t for (a, t) in annotated_b]
        blocks = KnitSequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
        a_cur = 0
        b_cur = 0
        for ai, bi, l in blocks:
            # process all mismatched sections
            # (last mismatched section is handled because blocks always
            # includes a 0-length last block)
            for revision, text in annotated_a[a_cur:ai]:
                yield status_a(revision, text)
            for revision, text in annotated_b[b_cur:bi]:
                yield status_b(revision, text)

            # and now the matched section
            a_cur = ai + l
            b_cur = bi + l
            for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
                assert text_a == text_b
                yield "unchanged", text_a


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode, file_mode=None,
                 create_parent_dir=False, dir_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode
        self._dir_mode = dir_mode
        self._create_parent_dir = create_parent_dir
        self._need_to_create = False

    def check_header(self, fp):
        line = fp.readline()
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to an index
    entry.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    when a write is interrupted to the index file, it will result in a line that
    does not end in ' :'. If the ' :' is not present at the end of a line, or at
    the end of the file, then the record that is missing it will be ignored by
    the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
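
    An illustrative index line (made-up values):

        rev-2 line-delta 907 84 0 .ghost-rev :

    meaning: 'rev-2' is stored as a line-delta starting at byte offset 907
    with length 84; its parents are the revision at index position 0
    (dictionary compressed) and the ghost revision 'ghost-rev'
    (uncompressed, '.'-prefixed).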
    """

    HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indices).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)
    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False, dir_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                try:
                    self.check_header(fp)
                    # readlines reads the whole file at once:
                    # bad for transports like http, good for local disk
                    # we save 60 ms doing this one change (
                    # from calling readline each time to calling
                    # readlines once)
                    # probably what we want for nice behaviour on
                    # http is an incremental readlines that yields, or
                    # a check for local vs non local indexes,
                    for l in fp.readlines():
                        rec = l.split()
                        if len(rec) < 5 or rec[-1] != ':':
                            # FIXME: in the future we should determine if its a
                            # short write - and ignore it
                            # or a different failure, and raise. RBC 20060407
                            continue
                        count += 1
                        total += 1
                        #pb.update('read knit index', count, total)
                        # See self._parse_parents
                        parents = []
                        for value in rec[4:-1]:
                            if value[0] == '.':
                                # uncompressed reference
                                parents.append(value[1:])
                            else:
                                # this is 15/4000ms faster than isinstance,
                                # this function is called thousands of times a
                                # second so small variations add up.
                                assert value.__class__ is str
                                parents.append(self._history[int(value)])
                        # end self._parse_parents
                        # self._cache_version(rec[0],
                        #                     rec[1].split(','),
                        #                     int(rec[2]),
                        #                     int(rec[3]),
                        #                     parents)
                        # --- self._cache_version
                        # only want the _history index to reference the 1st
                        # index entry for version_id
                        version_id = rec[0]
                        if version_id not in self._cache:
                            index = len(self._history)
                            self._history.append(version_id)
                        else:
                            index = self._cache[version_id][5]
                        self._cache[version_id] = (version_id,
                                                   rec[1].split(','),
                                                   int(rec[2]),
                                                   int(rec[3]),
                                                   parents,
                                                   index)
                        # --- self._cache_version
                finally:
                    fp.close()
            except NoSuchFile, e:
                if mode != 'w' or not create:
                    raise
                if delay_create:
                    self._need_to_create = True
                else:
                    self._transport.put_bytes_non_atomic(self._filename,
                        self.HEADER, mode=self._file_mode)
        finally:
            pb.update('read knit index', total, total)
            pb.finished()
    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
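
        For example (hypothetical values): ['2', '.ghost-rev'] becomes
        [self._history[2], 'ghost-rev'].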
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result
    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            try:
                parents = self._cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
                continue
            # got the parents ok
            for parent in parents:
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())
    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        return self._history

    def idx_to_name(self, idx):
        return self._history[idx]

    def lookup(self, version_id):
        assert version_id in self._cache
        return self._cache[version_id][5]

    def _version_list_to_index(self, versions):
        encode_utf8 = cache_utf8.encode
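        # Dictionary compression of the parent list: a parent already in the
        # index is written as its sequence number, an absent (ghost) parent is
        # written as '.' + revision id.  For example (made-up values), parents
        # ['rev-1', 'ghost-rev'] with 'rev-1' cached at index 3 serialize to
        # '3 .ghost-rev'.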
        result_list = []
        for version in versions:
            if version in self._cache:
                # -- inlined lookup() --
                result_list.append(str(self._cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + encode_utf8(version))
        return ' '.join(result_list)
    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        lines = []
        encode_utf8 = cache_utf8.encode
        for version_id, options, pos, size, parents in versions:
            line = "\n%s %s %s %s %s :" % (encode_utf8(version_id),
                                           ','.join(options),
                                           pos,
                                           size,
                                           self._version_list_to_index(parents))
            assert isinstance(line, str), \
                'content must be utf-8 encoded: %r' % (line,)
            lines.append(line)
        if not self._need_to_create:
            self._transport.append_bytes(self._filename, ''.join(lines))
        else:
            sio = StringIO()
            sio.write(self.HEADER)
            sio.writelines(lines)
            sio.seek(0)
            self._transport.put_file_non_atomic(self._filename, sio,
                                create_parent_dir=self._create_parent_dir,
                                mode=self._file_mode,
                                dir_mode=self._dir_mode)
            self._need_to_create = False

        # cache after writing, so that a failed write leads to missing cache
        # entries not extra ones. XXX TODO: RBC 20060502 in the event of a
        # failure, reload the index or flush it or some such, to prevent
        # writing records that did complete twice.
        for version_id, options, pos, size, parents in versions:
            self._cache_version(version_id, options, pos, size, parents)
    def has_version(self, version_id):
        """True if the version is in the index."""
        return (version_id in self._cache)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self._filename)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False,
                 dir_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._checked = False
        # TODO: jam 20060713 conceptually, this could spill to disk
        #       if the cached size gets larger than a certain amount
        #       but it complicates the model a bit, so for now just use
        #       a simple dictionary
        self._cache = {}
        self._do_cache = False
        if create:
            if delay_create:
                self._need_to_create = create
            else:
                self._transport.put_bytes_non_atomic(self._filename, '',
                                                     mode=self._file_mode)

    def enable_cache(self):
        """Enable caching of reads."""
        self._do_cache = True

    def clear_cache(self):
        """Clear the record cache."""
        self._do_cache = False
        self._cache = {}

    def _open_file(self):
        try:
            return self._transport.get(self._filename)
        except NoSuchFile:
            pass
        return None
    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)

        version_id_utf8 = cache_utf8.encode(version_id)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id_utf8,
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n" % version_id_utf8]))
        data_file.close()
        length = sio.tell()

        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file.

        :return: the offset in the data file raw_data was written.
        """
        assert isinstance(raw_data, str), 'data must be plain bytes'
        if not self._need_to_create:
            return self._transport.append_bytes(self._filename, raw_data)
        else:
            self._transport.put_bytes_non_atomic(self._filename, raw_data,
                                create_parent_dir=self._create_parent_dir,
                                mode=self._file_mode,
                                dir_mode=self._dir_mode)
            self._need_to_create = False
            return 0

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        if not self._need_to_create:
            start_pos = self._transport.append_file(self._filename, sio)
        else:
            self._transport.put_file_non_atomic(self._filename, sio,
                                create_parent_dir=self._create_parent_dir,
                                mode=self._file_mode,
                                dir_mode=self._dir_mode)
            self._need_to_create = False
            start_pos = 0
        if self._do_cache:
            self._cache[version_id] = sio.getvalue()
        return start_pos, size
    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if cache_utf8.decode(rec[1]) != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                  version_id, rec[1]))
        return df, rec

    def _parse_record(self, version_id, data):
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df, rec = self._parse_record_header(version_id, data)
        record_contents = df.readlines()
        l = record_contents.pop()
        assert len(record_contents) == int(rec[2])
        if l != 'end %s\n' % cache_utf8.encode(version_id):
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        return record_contents, rec[3]
    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.
        """
        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(records):
            # grab the disk data needed.
            if self._cache:
                # Don't check _cache if it is empty
                needed_offsets = [(pos, size) for version_id, pos, size
                                  in records
                                  if version_id not in self._cache]
            else:
                needed_offsets = [(pos, size) for version_id, pos, size
                                  in records]

            raw_records = self._transport.readv(self._filename, needed_offsets)

        for version_id, pos, size in records:
            if version_id in self._cache:
                # This data has already been validated
                data = self._cache[version_id]
            else:
                pos, data = raw_records.next()
                if self._do_cache:
                    self._cache[version_id] = data

                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
            yield version_id, data
    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        The result will be returned in whatever is the fastest to read.
        Not by the order requested. Also, multiple requests for the same
        record will only yield 1 response.
        :param records: A list of (version_id, pos, len) entries
        :return: Yields (version_id, contents, digest) in the order
                 read, not the order requested
        """
        if not records:
            return

        if self._cache:
            # Skip records we have alread seen
            yielded_records = set()
            needed_records = set()
            for record in records:
                if record[0] in self._cache:
                    if record[0] in yielded_records:
                        continue
                    yielded_records.add(record[0])
                    data = self._cache[record[0]]
                    content, digest = self._parse_record(record[0], data)
                    yield (record[0], content, digest)
                else:
                    needed_records.add(record)
            needed_records = sorted(needed_records, key=operator.itemgetter(1))
        else:
            needed_records = sorted(set(records), key=operator.itemgetter(1))

        if not needed_records:
            return

        # The transport optimizes the fetching as well
        # (ie, reads continuous ranges.)
        readv_response = self._transport.readv(self._filename,
            [(pos, size) for version_id, pos, size in needed_records])

        for (version_id, pos, size), (pos, data) in \
                izip(iter(needed_records), readv_response):
            content, digest = self._parse_record(version_id, data)
            if self._do_cache:
                self._cache[version_id] = data
            yield version_id, content, digest

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in \
                self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits.  """
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False
    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)


class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            return (isinstance(source, bzrlib.weave.Weave) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False
    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # if all of n2's parents are in n1, then its fine.
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count = count + 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(WeaveToKnit)


class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 1, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize