# Copyright (C) 2005, 2006 Canonical Ltd
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

lifeless: the data file is made up of "delta records". each delta record has a delta header
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent). the delta also ends with an
end-marker; simply "end VERSION"

delta can be line or full contents.
... the 8's there are the index number of the annotation.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8 e.set('executable', 'yes')
8 if elt.get('executable') == 'yes':
8 ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation
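
For example (the ids and numbers here are invented for illustration, not taken
from a real knit): an index tuple of ('rev-2', ['line-delta'], 340, 97, [0])
would describe a version 'rev-2' stored as a line delta starting at byte 340 of
the data file, 97 bytes long, whose parent is the version with index sequence
number 0.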
# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?
import difflib
import operator
import warnings
from cStringIO import StringIO
from itertools import izip, chain
from bzrlib import cache_utf8, errors, patiencediff, progress, ui
from bzrlib.errors import (InvalidRevisionId, KnitCorrupt, KnitHeaderError,
    RevisionAlreadyPresent, RevisionNotPresent)
from bzrlib.tuned_gzip import GzipFile
from bzrlib.trace import mutter
from bzrlib.osutils import (contains_whitespace, sha_strings)
from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed
from bzrlib.tsort import topo_sort
import bzrlib.weave
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
# TODO: Split out code specific to this format into an associated object.
# TODO: Can we put in some kind of value to check that the index and data
# files belong together?
# TODO: accommodate binaries, perhaps by storing a byte count
# TODO: function to check whole file
# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located. we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'
class KnitContent(object):
"""Content of a knit version to which deltas can be applied."""
def __init__(self, lines):
def annotate_iter(self):
"""Yield tuples of (origin, text) for each content line."""
return iter(self._lines)
"""Return a list of (origin, text) tuples."""
return list(self.annotate_iter())
def line_delta_iter(self, new_lines):
"""Generate line-based delta from this content to new_lines."""
new_texts = new_lines.text()
old_texts = self.text()
s = KnitSequenceMatcher(None, old_texts, new_texts)
for tag, i1, i2, j1, j2 in s.get_opcodes():
# ofrom, oto, length, data
yield i1, i2, j2 - j1, new_lines._lines[j1:j2]
def line_delta(self, new_lines):
return list(self.line_delta_iter(new_lines))
return [text for origin, text in self._lines]
return KnitContent(self._lines[:])
class _KnitFactory(object):
"""Base factory for creating content objects."""
def make(self, lines, version):
num_lines = len(lines)
return KnitContent(zip([version] * num_lines, lines))
class KnitAnnotateFactory(_KnitFactory):
"""Factory for creating annotated Content objects."""
def parse_fulltext(self, content, version):
"""Convert fulltext to internal representation
fulltext content is of the format
revid(utf8) plaintext\n
internal representation is of the format:
decode_utf8 = cache_utf8.decode
origin, text = line.split(' ', 1)
lines.append((decode_utf8(origin), text))
return KnitContent(lines)
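# Illustrative note (example values invented, not from the original source):
# a serialized annotated fulltext line such as "rev-1 hello world\n" is split
# on the first space, giving the tuple (u'rev-1', 'hello world\n') in the
# resulting KnitContent._lines list.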
def parse_line_delta_iter(self, lines):
return iter(self.parse_line_delta(lines))
def parse_line_delta(self, lines, version):
"""Convert a line based delta into internal representation.
line delta is in the form of:
intstart intend intcount
revid(utf8) newline\n
internal representation is
(start, end, count, [1..count tuples (revid, newline)])
decode_utf8 = cache_utf8.decode
# walk through the lines parsing.
start, end, count = [int(n) for n in header.split(',')]
origin, text = next().split(' ', 1)
contents.append((decode_utf8(origin), text))
result.append((start, end, count, contents))
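# Illustrative note (values invented): a serialized delta hunk of
#   "1,2,1\n"
#   "rev-2 replacement line\n"
# means "replace old lines [1:2] with one new line", and parses to the tuple
# (1, 2, 1, [(u'rev-2', 'replacement line\n')]).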
def get_fulltext_content(self, lines):
"""Extract just the content lines from a fulltext."""
return (line.split(' ', 1)[1] for line in lines)
def get_linedelta_content(self, lines):
"""Extract just the content from a line delta.
This doesn't return all of the extra information stored in a delta.
Only the actual content lines.
header = header.split(',')
count = int(header[2])
for i in xrange(count):
origin, text = next().split(' ', 1)
def lower_fulltext(self, content):
"""convert a fulltext content record into a serializable form.
see parse_fulltext which this inverts.
encode_utf8 = cache_utf8.encode
return ['%s %s' % (encode_utf8(o), t) for o, t in content._lines]
def lower_line_delta(self, delta):
"""convert a delta into a serializable form.
See parse_line_delta which this inverts.
encode_utf8 = cache_utf8.encode
for start, end, c, lines in delta:
out.append('%d,%d,%d\n' % (start, end, c))
out.extend(encode_utf8(origin) + ' ' + text
for origin, text in lines)
class KnitPlainFactory(_KnitFactory):
"""Factory for creating plain Content objects."""
def parse_fulltext(self, content, version):
"""This parses an unannotated fulltext.
Note that this is not a noop - the internal representation
has (versionid, line) - it's just a constant versionid.
return self.make(content, version)
def parse_line_delta_iter(self, lines, version):
num_lines = len(lines)
while cur < num_lines:
start, end, c = [int(n) for n in header.split(',')]
yield start, end, c, zip([version] * c, lines[cur:cur+c])
def parse_line_delta(self, lines, version):
return list(self.parse_line_delta_iter(lines, version))
def get_fulltext_content(self, lines):
"""Extract just the content lines from a fulltext."""
def get_linedelta_content(self, lines):
"""Extract just the content from a line delta.
This doesn't return all of the extra information stored in a delta.
Only the actual content lines.
header = header.split(',')
count = int(header[2])
for i in xrange(count):
def lower_fulltext(self, content):
return content.text()
def lower_line_delta(self, delta):
for start, end, c, lines in delta:
out.append('%d,%d,%d\n' % (start, end, c))
out.extend([text for origin, text in lines])
def make_empty_knit(transport, relpath):
"""Construct an empty knit at the specified location."""
k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory)
class KnitVersionedFile(VersionedFile):
"""Weave-like structure with faster random access.

A knit stores a number of texts and a summary of the relationships
between them. Texts are identified by a string version-id. Texts
are normally stored and retrieved as a series of lines, but can
also be passed as single strings.

Lines are stored with the trailing newline (if any) included, to
avoid special cases for files with no final newline. Lines are
composed of 8-bit characters, not unicode. The combination of
these approaches should mean any 'binary' file can be safely
stored and retrieved.
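
A minimal usage sketch, purely illustrative (the revision ids and the 'name'
path below are invented, and error handling is omitted):

knit = KnitVersionedFile('name', transport, access_mode='w', create=True)
knit.add_lines('rev-1', [], ['hello\n'])
knit.add_lines('rev-2', ['rev-1'], ['hello\n', 'world\n'])
assert knit.get_lines('rev-2') == ['hello\n', 'world\n']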
def __init__(self, relpath, transport, file_mode=None, access_mode=None,
factory=None, basis_knit=DEPRECATED_PARAMETER, delta=True,
create=False, create_parent_dir=False, delay_create=False,
"""Construct a knit at location specified by relpath.
:param create: If not True, only open an existing knit.
:param create_parent_dir: If True, create the parent directory if
creating the file fails. (This is used for stores with
hash-prefixes that may not exist yet)
:param delay_create: The calling code is aware that the knit won't
actually be created until the first data is stored.
if deprecated_passed(basis_knit):
warnings.warn("KnitVersionedFile.__(): The basis_knit parameter is"
" deprecated as of bzr 0.9.",
DeprecationWarning, stacklevel=2)
if access_mode is None:
super(KnitVersionedFile, self).__init__(access_mode)
assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
self.transport = transport
self.filename = relpath
self.factory = factory or KnitAnnotateFactory()
self.writable = (access_mode == 'w')
self._max_delta_chain = 200
self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
access_mode, create=create, file_mode=file_mode,
create_parent_dir=create_parent_dir, delay_create=delay_create,
self._data = _KnitData(transport, relpath + DATA_SUFFIX,
access_mode, create=create and not len(self), file_mode=file_mode,
create_parent_dir=create_parent_dir, delay_create=delay_create,
return '%s(%s)' % (self.__class__.__name__,
self.transport.abspath(self.filename))
def _check_should_delta(self, first_parents):
"""Iterate back through the parent listing, looking for a fulltext.
This is used when we want to decide whether to add a delta or a new
fulltext. It searches for _max_delta_chain parents. When it finds a
fulltext parent, it sees if the total size of the deltas leading up to
it is large enough to indicate that we want a new full text anyway.
Return True if we should create a new delta, False if we should use a
delta_parents = first_parents
for count in xrange(self._max_delta_chain):
parent = delta_parents[0]
method = self._index.get_method(parent)
pos, size = self._index.get_position(parent)
if method == 'fulltext':
delta_parents = self._index.get_parents(parent)
# We couldn't find a fulltext, so we must create a new one
return fulltext_size > delta_size
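# Illustrative worked example (sizes invented): if the nearest fulltext in the
# chain is 10000 bytes and the deltas accumulated on the way back to it total
# 2500 bytes, fulltext_size > delta_size holds and we keep delta-compressing;
# once the accumulated delta size exceeds the fulltext size, this returns
# False and the caller stores a new fulltext instead.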
def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
"""See VersionedFile._add_delta()."""
self._check_add(version_id, []) # should we check the lines ?
self._check_versions_present(parents)
for parent in parents:
if not self.has_version(parent):
ghosts.append(parent)
present_parents.append(parent)
if delta_parent is None:
# reconstitute as full text.
assert len(delta) == 1 or len(delta) == 0
assert delta[0][0] == 0
assert delta[0][1] == 0, delta[0][1]
return super(KnitVersionedFile, self)._add_delta(version_id,
options.append('no-eol')
if delta_parent is not None:
# determine the current delta chain length.
# To speed the extract of texts the delta chain is limited
# to a fixed number of deltas. This should minimize both
# I/O and the time spent applying deltas.
# The window was changed to a maximum of 200 deltas, but also added
# was a check that the total compressed size of the deltas is
# smaller than the compressed size of the fulltext.
if not self._check_should_delta([delta_parent]):
# We don't want a delta here, just do a normal insertion.
return super(KnitVersionedFile, self)._add_delta(version_id,
options.append('line-delta')
store_lines = self.factory.lower_line_delta(delta)
where, size = self._data.add_record(version_id, digest, store_lines)
self._index.add_version(version_id, options, where, size, parents)
def _add_raw_records(self, records, data):
"""Add all the records 'records' with data pre-joined in 'data'.
:param records: A list of tuples(version_id, options, parents, size).
:param data: The data for the records. When it is written, the records
are adjusted to have pos pointing into data by the sum of
the preceding records sizes.
pos = self._data.add_raw_record(data)
for (version_id, options, parents, size) in records:
index_entries.append((version_id, options, pos+offset,
if self._data._do_cache:
self._data._cache[version_id] = data[offset:offset+size]
self._index.add_versions(index_entries)
def enable_cache(self):
"""Start caching data for this knit"""
self._data.enable_cache()
def clear_cache(self):
"""Clear the data cache only."""
self._data.clear_cache()
def copy_to(self, name, transport):
"""See VersionedFile.copy_to()."""
# copy the current index to a temp index to avoid racing with local
transport.put_file_non_atomic(name + INDEX_SUFFIX + '.tmp',
self.transport.get(self._index._filename))
f = self._data._open_file()
transport.put_file(name + DATA_SUFFIX, f)
# move the copied index into place
transport.move(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)
def create_empty(self, name, transport, mode=None):
return KnitVersionedFile(name, transport, factory=self.factory,
delta=self.delta, create=True)
def _fix_parents(self, version, new_parents):
"""Fix the parents list for version.
This is done by appending a new version to the index
with identical data except for the parents list.
the parents list must be a superset of the current
current_values = self._index._cache[version]
assert set(current_values[4]).difference(set(new_parents)) == set()
self._index.add_version(version,
def get_delta(self, version_id):
"""Get a delta for constructing version from some other version."""
if self.reserved_id(version_id):
raise errors.ReservedId(version_id)
if not self.has_version(version_id):
raise RevisionNotPresent(version_id, self.filename)
parents = self.get_parents(version_id)
data_pos, data_size = self._index.get_position(version_id)
data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
version_idx = self._index.lookup(version_id)
noeol = 'no-eol' in self._index.get_options(version_id)
if 'fulltext' == self._index.get_method(version_id):
new_content = self.factory.parse_fulltext(data, version_idx)
if parent is not None:
reference_content = self._get_content(parent)
old_texts = reference_content.text()
new_texts = new_content.text()
delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
delta = self.factory.parse_line_delta(data, version_idx)
return parent, sha1, noeol, delta
def get_graph_with_ghosts(self):
"""See VersionedFile.get_graph_with_ghosts()."""
graph_items = self._index.get_graph()
return dict(graph_items)
def get_sha1(self, version_id):
"""See VersionedFile.get_sha1()."""
record_map = self._get_record_map([version_id])
method, content, digest, next = record_map[version_id]
"""See VersionedFile.get_suffixes()."""
return [DATA_SUFFIX, INDEX_SUFFIX]
def has_ghost(self, version_id):
"""True if there is a ghost reference in the file to version_id."""
if self.has_version(version_id):
# optimisable if needed by memoising the _ghosts set.
items = self._index.get_graph()
for node, parents in items:
for parent in parents:
if parent not in self._index._cache:
if parent == version_id:
"""See VersionedFile.versions."""
return self._index.get_versions()
def has_version(self, version_id):
"""See VersionedFile.has_version."""
return self._index.has_version(version_id)
__contains__ = has_version
def _merge_annotations(self, content, parents, parent_texts={},
delta=None, annotated=None):
"""Merge annotations for content. This is done by comparing
the annotations based on changes to the text.
for parent_id in parents:
merge_content = self._get_content(parent_id, parent_texts)
seq = patiencediff.PatienceSequenceMatcher(
None, merge_content.text(), content.text())
if delta_seq is None:
# setup a delta seq to reuse.
for i, j, n in seq.get_matching_blocks():
# this appears to copy (origin, text) pairs across to the new
# content for any line that matches the last-checked parent.
# FIXME: save the sequence control data for delta compression
# against the most relevant parent rather than rediffing.
content._lines[j:j+n] = merge_content._lines[i:i+n]
reference_content = self._get_content(parents[0], parent_texts)
new_texts = content.text()
old_texts = reference_content.text()
delta_seq = patiencediff.PatienceSequenceMatcher(
None, old_texts, new_texts)
return self._make_line_delta(delta_seq, content)
def _make_line_delta(self, delta_seq, new_content):
"""Generate a line delta from delta_seq and new_content."""
for op in delta_seq.get_opcodes():
diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
def _get_components_positions(self, version_ids):
"""Produce a map of position data for the components of versions.
This data is intended to be used for retrieving the knit records.
A dict of version_id to (method, data_pos, data_size, next) is
method is the way referenced data should be applied.
data_pos is the position of the data in the knit.
data_size is the size of the data in the knit.
next is the build-parent of the version, or None for fulltexts.
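
For example (entirely made-up values): a delta-compressed version might map
as 'rev-3' -> ('line-delta', 1024, 210, 'rev-2'), while the fulltext it
ultimately builds on maps as 'rev-1' -> ('fulltext', 0, 4096, None).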
for version_id in version_ids:
while cursor is not None and cursor not in component_data:
method = self._index.get_method(cursor)
if method == 'fulltext':
next = self.get_parents(cursor)[0]
data_pos, data_size = self._index.get_position(cursor)
component_data[cursor] = (method, data_pos, data_size, next)
return component_data
def _get_content(self, version_id, parent_texts={}):
"""Returns a content object that makes up the specified
if not self.has_version(version_id):
raise RevisionNotPresent(version_id, self.filename)
cached_version = parent_texts.get(version_id, None)
if cached_version is not None:
return cached_version
text_map, contents_map = self._get_content_maps([version_id])
return contents_map[version_id]
def _check_versions_present(self, version_ids):
"""Check that all specified versions are present."""
self._index.check_versions_present(version_ids)
def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
"""See VersionedFile.add_lines_with_ghosts()."""
self._check_add(version_id, lines)
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
def _add_lines(self, version_id, parents, lines, parent_texts):
"""See VersionedFile.add_lines."""
self._check_add(version_id, lines)
self._check_versions_present(parents)
return self._add(version_id, lines[:], parents, self.delta, parent_texts)
def _check_add(self, version_id, lines):
"""check that version_id and lines are safe to add."""
assert self.writable, "knit is not opened for write"
### FIXME escape. RBC 20060228
if contains_whitespace(version_id):
raise InvalidRevisionId(version_id, self.filename)
if self.reserved_id(version_id):
raise errors.ReservedId(version_id)
if self.has_version(version_id):
raise RevisionAlreadyPresent(version_id, self.filename)
self._check_lines_not_unicode(lines)
self._check_lines_are_lines(lines)
def _add(self, version_id, lines, parents, delta, parent_texts):
"""Add a set of lines on top of version specified by parents.
If delta is true, compress the text as a line-delta against
Any versions not present will be converted into ghosts.
# 461 0 6546.0390 43.9100 bzrlib.knit:489(_add)
# +400 0 889.4890 418.9790 +bzrlib.knit:192(lower_fulltext)
# +461 0 1364.8070 108.8030 +bzrlib.knit:996(add_record)
# +461 0 193.3940 41.5720 +bzrlib.knit:898(add_version)
# +461 0 134.0590 18.3810 +bzrlib.osutils:361(sha_strings)
# +461 0 36.3420 15.4540 +bzrlib.knit:146(make)
# +1383 0 8.0370 8.0370 +<len>
# +61 0 13.5770 7.9190 +bzrlib.knit:199(lower_line_delta)
# +61 0 963.3470 7.8740 +bzrlib.knit:427(_get_content)
# +61 0 973.9950 5.2950 +bzrlib.knit:136(line_delta)
# +61 0 1918.1800 5.2640 +bzrlib.knit:359(_merge_annotations)
if parent_texts is None:
for parent in parents:
if not self.has_version(parent):
ghosts.append(parent)
present_parents.append(parent)
if delta and not len(present_parents):
digest = sha_strings(lines)
if lines[-1][-1] != '\n':
options.append('no-eol')
lines[-1] = lines[-1] + '\n'
if len(present_parents) and delta:
# To speed the extract of texts the delta chain is limited
# to a fixed number of deltas. This should minimize both
# I/O and the time spent applying deltas.
delta = self._check_should_delta(present_parents)
lines = self.factory.make(lines, version_id)
if delta or (self.factory.annotated and len(present_parents) > 0):
# Merge annotations from parent texts if needed.
delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
delta, self.factory.annotated)
options.append('line-delta')
store_lines = self.factory.lower_line_delta(delta_hunks)
options.append('fulltext')
store_lines = self.factory.lower_fulltext(lines)
where, size = self._data.add_record(version_id, digest, store_lines)
self._index.add_version(version_id, options, where, size, parents)
def check(self, progress_bar=None):
"""See VersionedFile.check()."""
def _clone_text(self, new_version_id, old_version_id, parents):
"""See VersionedFile.clone_text()."""
# FIXME RBC 20060228 make fast by only inserting an index with null
self.add_lines(new_version_id, parents, self.get_lines(old_version_id))
def get_lines(self, version_id):
"""See VersionedFile.get_lines()."""
return self.get_line_list([version_id])[0]
def _get_record_map(self, version_ids):
"""Produce a dictionary of knit records.
The keys are version_ids, the values are tuples of (method, content,
method is the way the content should be applied.
content is a KnitContent object.
digest is the SHA1 digest of this version id after all steps are done
next is the build-parent of the version, i.e. the leftmost ancestor.
If the method is fulltext, next will be None.
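
For example (invented values): record_map['rev-3'] might be
('line-delta', <KnitContent>, 'sha1-of-rev-3-text', 'rev-2'), while a
fulltext entry would have None in the final position.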
position_map = self._get_components_positions(version_ids)
# c = component_id, m = method, p = position, s = size, n = next
records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]
for component_id, content, digest in \
self._data.read_records_iter(records):
method, position, size, next = position_map[component_id]
record_map[component_id] = method, content, digest, next
def get_text(self, version_id):
"""See VersionedFile.get_text"""
return self.get_texts([version_id])[0]
def get_texts(self, version_ids):
return [''.join(l) for l in self.get_line_list(version_ids)]
def get_line_list(self, version_ids):
"""Return the texts of listed versions as a list of strings."""
for version_id in version_ids:
if self.reserved_id(version_id):
raise errors.ReservedId(version_id)
text_map, content_map = self._get_content_maps(version_ids)
return [text_map[v] for v in version_ids]
def _get_content_maps(self, version_ids):
"""Produce maps of text and KnitContents
:return: (text_map, content_map) where text_map contains the texts for
the requested versions and content_map contains the KnitContents.
Both dicts take version_ids as their keys.
for version_id in version_ids:
if not self.has_version(version_id):
raise RevisionNotPresent(version_id, self.filename)
record_map = self._get_record_map(version_ids)
for version_id in version_ids:
while cursor is not None:
method, data, digest, next = record_map[cursor]
components.append((cursor, method, data, digest))
if cursor in content_map:
for component_id, method, data, digest in reversed(components):
if component_id in content_map:
content = content_map[component_id]
version_idx = self._index.lookup(component_id)
if method == 'fulltext':
assert content is None
content = self.factory.parse_fulltext(data, version_idx)
elif method == 'line-delta':
delta = self.factory.parse_line_delta(data, version_idx)
content = content.copy()
content._lines = self._apply_delta(content._lines,
content_map[component_id] = content
if 'no-eol' in self._index.get_options(version_id):
content = content.copy()
line = content._lines[-1][1].rstrip('\n')
content._lines[-1] = (content._lines[-1][0], line)
final_content[version_id] = content
# digest here is the digest from the last applied component.
text = content.text()
if sha_strings(text) != digest:
raise KnitCorrupt(self.filename,
'sha-1 does not match %s' % version_id)
text_map[version_id] = text
return text_map, final_content
def iter_lines_added_or_present_in_versions(self, version_ids=None,
"""See VersionedFile.iter_lines_added_or_present_in_versions()."""
if version_ids is None:
version_ids = self.versions()
pb = progress.DummyProgress()
# we don't care about inclusions, the caller cares.
# but we need to setup a list of records to visit.
# we need version_id, position, length
version_id_records = []
requested_versions = set(version_ids)
# filter for available versions
for version_id in requested_versions:
if not self.has_version(version_id):
raise RevisionNotPresent(version_id, self.filename)
# get an in-component-order queue:
for version_id in self.versions():
if version_id in requested_versions:
data_pos, length = self._index.get_position(version_id)
version_id_records.append((version_id, data_pos, length))
total = len(version_id_records)
for version_idx, (version_id, data, sha_value) in \
enumerate(self._data.read_records_iter(version_id_records)):
pb.update('Walking content.', version_idx, total)
method = self._index.get_method(version_id)
version_idx = self._index.lookup(version_id)
assert method in ('fulltext', 'line-delta')
if method == 'fulltext':
line_iterator = self.factory.get_fulltext_content(data)
line_iterator = self.factory.get_linedelta_content(data)
for line in line_iterator:
pb.update('Walking content.', total, total)
def num_versions(self):
"""See VersionedFile.num_versions()."""
return self._index.num_versions()
__len__ = num_versions
def annotate_iter(self, version_id):
"""See VersionedFile.annotate_iter."""
content = self._get_content(version_id)
for origin, text in content.annotate_iter():
def get_parents(self, version_id):
"""See VersionedFile.get_parents."""
# 52554 calls in 1264 872 internal down from 3674
return self._index.get_parents(version_id)
raise RevisionNotPresent(version_id, self.filename)
def get_parents_with_ghosts(self, version_id):
"""See VersionedFile.get_parents_with_ghosts."""
return self._index.get_parents_with_ghosts(version_id)
raise RevisionNotPresent(version_id, self.filename)
def get_ancestry(self, versions):
"""See VersionedFile.get_ancestry."""
if isinstance(versions, basestring):
versions = [versions]
return self._index.get_ancestry(versions)
def get_ancestry_with_ghosts(self, versions):
"""See VersionedFile.get_ancestry_with_ghosts."""
if isinstance(versions, basestring):
versions = [versions]
return self._index.get_ancestry_with_ghosts(versions)
#@deprecated_method(zero_eight)
def walk(self, version_ids):
"""See VersionedFile.walk."""
# We take the short path here, and extract all relevant texts
# and put them in a weave and let that do all the work. Far
# from optimal, but is much simpler.
# FIXME RB 20060228 this really is inefficient!
from bzrlib.weave import Weave
w = Weave(self.filename)
ancestry = self.get_ancestry(version_ids)
sorted_graph = topo_sort(self._index.get_graph())
version_list = [vid for vid in sorted_graph if vid in ancestry]
for version_id in version_list:
lines = self.get_lines(version_id)
w.add_lines(version_id, self.get_parents(version_id), lines)
for lineno, insert_id, dset, line in w.walk(version_ids):
yield lineno, insert_id, dset, line
def plan_merge(self, ver_a, ver_b):
"""See VersionedFile.plan_merge."""
ancestors_b = set(self.get_ancestry(ver_b))
def status_a(revision, text):
if revision in ancestors_b:
return 'killed-b', text
ancestors_a = set(self.get_ancestry(ver_a))
def status_b(revision, text):
if revision in ancestors_a:
return 'killed-a', text
annotated_a = self.annotate(ver_a)
annotated_b = self.annotate(ver_b)
plain_a = [t for (a, t) in annotated_a]
plain_b = [t for (a, t) in annotated_b]
blocks = KnitSequenceMatcher(None, plain_a, plain_b).get_matching_blocks()
for ai, bi, l in blocks:
# process all mismatched sections
# (last mismatched section is handled because blocks always
# includes a 0-length last block)
for revision, text in annotated_a[a_cur:ai]:
yield status_a(revision, text)
for revision, text in annotated_b[b_cur:bi]:
yield status_b(revision, text)
# and now the matched section
for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
assert text_a == text_b
yield "unchanged", text_a


class _KnitComponentFile(object):
"""One of the files used to implement a knit database"""
def __init__(self, transport, filename, mode, file_mode=None,
create_parent_dir=False, dir_mode=None):
self._transport = transport
self._filename = filename
self._file_mode = file_mode
self._dir_mode = dir_mode
self._create_parent_dir = create_parent_dir
self._need_to_create = False
def _full_path(self):
"""Return the full path to this file."""
return self._transport.base + self._filename
def check_header(self, fp):
line = fp.readline()
# An empty file can actually be treated as though the file doesn't
raise errors.NoSuchFile(self._full_path())
if line != self.HEADER:
raise KnitHeaderError(badline=line,
filename=self._transport.abspath(self._filename))
"""Commit is a nop."""
return '%s(%s)' % (self.__class__.__name__, self._filename)
class _KnitIndex(_KnitComponentFile):
"""Manages knit index file.

The index is already kept in memory and read on startup, to enable
fast lookups of revision information. The cursor of the index
file is always pointing to the end, making it easy to append

_cache is a cache for fast mapping from version id to an Index
_history is a cache for fast mapping from indexes to version ids.

The index data format is dictionary compressed when it comes to
parent references; an index entry may only have parents with a
lower index number. As a result, the index is topologically sorted.

Duplicate entries may be written to the index for a single version id
if this is done then the latter one completely replaces the former:
this allows updates to correct version and parent information.
Note that the two entries may share the delta, and that successive
annotations and references MUST point to the first entry.

The index file on disc contains a header, followed by one line per knit
record. The same revision can be present in an index file more than once.
The first occurrence gets assigned a sequence number starting from 0.

The format of a single line is
REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
REVISION_ID is a utf8-encoded revision id
FLAGS is a comma separated list of flags about the record. Values include
no-eol, line-delta, fulltext.
BYTE_OFFSET is the ascii representation of the byte offset in the data file
that the compressed data starts at.
LENGTH is the ascii representation of the length of the record in the data file.
PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
revision id already in the knit that is a parent of REVISION_ID.
The ' :' marker is the end of record marker.

When a write to the index file is interrupted, it will result in a line
that does not end in ' :'. If the ' :' is not present at the end of a line,
or at the end of the file, then the record that is missing it will be
ignored by the parser.

When writing new records to the index file, the data is preceded by '\n'
to ensure that records always start on new lines even if the last write was
interrupted. As a result it's normal for the last line in the index to be
missing a trailing newline. One can be added with no harmful effects.
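
Example (illustrative only; the revision ids, offsets and sizes are made up):

rev-1 fulltext 0 124  :
rev-2 line-delta 124 78 0 .ghost-rev :

Here 'rev-2' is stored as a line delta whose parents are the version with
sequence number 0 ('rev-1') and the ghost revision 'ghost-rev'.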
HEADER = "# bzr knit index 8\n"
# speed of knit parsing went from 280 ms to 280 ms with slots addition.
# __slots__ = ['_cache', '_history', '_transport', '_filename']
def _cache_version(self, version_id, options, pos, size, parents):
"""Cache a version record in the history array and index cache.
This is inlined into _load_data for performance. KEEP IN SYNC.
(It saves 60ms, 25% of the __init__ overhead on local 4000 record
# only want the _history index to reference the 1st index entry
if version_id not in self._cache:
index = len(self._history)
self._history.append(version_id)
index = self._cache[version_id][5]
self._cache[version_id] = (version_id,
def __init__(self, transport, filename, mode, create=False, file_mode=None,
create_parent_dir=False, delay_create=False, dir_mode=None):
_KnitComponentFile.__init__(self, transport, filename, mode,
file_mode=file_mode,
create_parent_dir=create_parent_dir,
# position in _history is the 'official' index for a revision
# but the values may have come from a newer entry.
# so - wc -l of a knit index is != the number of unique names
decode_utf8 = cache_utf8.decode
pb = ui.ui_factory.nested_progress_bar()
pb.update('read knit index', 0, 1)
fp = self._transport.get(self._filename)
# _load_data may raise NoSuchFile if the target knit is
if mode != 'w' or not create:
self._need_to_create = True
self._transport.put_bytes_non_atomic(
self._filename, self.HEADER, mode=self._file_mode)
pb.update('read knit index', 1, 1)
def _load_data(self, fp):
history = self._history
decode_utf8 = cache_utf8.decode
self.check_header(fp)
# readlines reads the whole file at once:
# bad for transports like http, good for local disk
# we save 60 ms doing this one change (
# from calling readline each time to calling
# probably what we want for nice behaviour on
# http is an incremental readlines that yields, or
# a check for local vs non local indexes,
history_top = len(history) - 1
for line in fp.readlines():
if len(rec) < 5 or rec[-1] != ':':
# FIXME: in the future we should determine if it's a
# short write - and ignore it
# or a different failure, and raise. RBC 20060407
for value in rec[4:-1]:
# uncompressed reference
parents.append(decode_utf8(value[1:]))
parents.append(history[int(value)])
version_id, options, pos, size = rec[:4]
version_id = decode_utf8(version_id)
# See self._cache_version
# only want the _history index to reference the 1st
# index entry for version_id
if version_id not in cache:
history.append(version_id)
index = cache[version_id][5]
cache[version_id] = (version_id,
# end self._cache_version
def get_graph(self):
return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]
def get_ancestry(self, versions):
"""See VersionedFile.get_ancestry."""
# get a graph of all the mentioned versions:
pending = set(versions)
version = pending.pop()
parents = [p for p in cache[version][4] if p in cache]
raise RevisionNotPresent(version, self._filename)
# if not completed and not a ghost
pending.update([p for p in parents if p not in graph])
graph[version] = parents
return topo_sort(graph.items())
def get_ancestry_with_ghosts(self, versions):
"""See VersionedFile.get_ancestry_with_ghosts."""
# get a graph of all the mentioned versions:
self.check_versions_present(versions)
pending = set(versions)
version = pending.pop()
parents = cache[version][4]
pending.update([p for p in parents if p not in graph])
graph[version] = parents
return topo_sort(graph.items())
def num_versions(self):
return len(self._history)
__len__ = num_versions
def get_versions(self):
return self._history
def idx_to_name(self, idx):
return self._history[idx]
def lookup(self, version_id):
assert version_id in self._cache
return self._cache[version_id][5]
def _version_list_to_index(self, versions):
encode_utf8 = cache_utf8.encode
for version in versions:
if version in cache:
# -- inlined lookup() --
result_list.append(str(cache[version][5]))
# -- end lookup () --
result_list.append('.' + encode_utf8(version))
return ' '.join(result_list)
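# Illustrative example (names invented): for parents ['rev-1', 'ghost-rev'],
# where 'rev-1' is cached with sequence number 3 and 'ghost-rev' is absent
# from the index, the serialized parent list would be '3 .ghost-rev'.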
def add_version(self, version_id, options, pos, size, parents):
"""Add a version record to the index."""
self.add_versions(((version_id, options, pos, size, parents),))
def add_versions(self, versions):
"""Add multiple versions to the index.
:param versions: a list of tuples:
(version_id, options, pos, size, parents).
encode_utf8 = cache_utf8.encode
orig_history = self._history[:]
orig_cache = self._cache.copy()
for version_id, options, pos, size, parents in versions:
line = "\n%s %s %s %s %s :" % (encode_utf8(version_id),
self._version_list_to_index(parents))
assert isinstance(line, str), \
'content must be utf-8 encoded: %r' % (line,)
self._cache_version(version_id, options, pos, size, parents)
if not self._need_to_create:
self._transport.append_bytes(self._filename, ''.join(lines))
sio.write(self.HEADER)
sio.writelines(lines)
self._transport.put_file_non_atomic(self._filename, sio,
create_parent_dir=self._create_parent_dir,
mode=self._file_mode,
dir_mode=self._dir_mode)
self._need_to_create = False
# If any problems happen, restore the original values and re-raise
self._history = orig_history
self._cache = orig_cache
def has_version(self, version_id):
"""True if the version is in the index."""
return version_id in self._cache
def get_position(self, version_id):
"""Return data position and size of specified version."""
entry = self._cache[version_id]
return entry[2], entry[3]
def get_method(self, version_id):
"""Return compression method of specified version."""
options = self._cache[version_id][1]
if 'fulltext' in options:
if 'line-delta' not in options:
raise errors.KnitIndexUnknownMethod(self._full_path(), options)
def get_options(self, version_id):
return self._cache[version_id][1]
def get_parents(self, version_id):
"""Return parents of specified version ignoring ghosts."""
return [parent for parent in self._cache[version_id][4]
if parent in self._cache]
def get_parents_with_ghosts(self, version_id):
"""Return parents of specified version with ghosts."""
return self._cache[version_id][4]
def check_versions_present(self, version_ids):
"""Check that all specified versions are present."""
for version_id in version_ids:
if version_id not in cache:
raise RevisionNotPresent(version_id, self._filename)
class _KnitData(_KnitComponentFile):
"""Contents of the knit data file"""
def __init__(self, transport, filename, mode, create=False, file_mode=None,
create_parent_dir=False, delay_create=False,
_KnitComponentFile.__init__(self, transport, filename, mode,
file_mode=file_mode,
create_parent_dir=create_parent_dir,
self._checked = False
# TODO: jam 20060713 conceptually, this could spill to disk
# if the cached size gets larger than a certain amount
# but it complicates the model a bit, so for now just use
# a simple dictionary
self._do_cache = False
self._need_to_create = create
self._transport.put_bytes_non_atomic(self._filename, '',
mode=self._file_mode)
def enable_cache(self):
"""Enable caching of reads."""
self._do_cache = True
def clear_cache(self):
"""Clear the record cache."""
self._do_cache = False
def _open_file(self):
return self._transport.get(self._filename)
def _record_to_data(self, version_id, digest, lines):
"""Convert version_id, digest, lines into a raw data block.
:return: (len, a StringIO instance with the raw data ready to read.)
data_file = GzipFile(None, mode='wb', fileobj=sio)
version_id_utf8 = cache_utf8.encode(version_id)
data_file.writelines(chain(
["version %s %d %s\n" % (version_id_utf8,
["end %s\n" % version_id_utf8]))
def add_raw_record(self, raw_data):
"""Append a prepared record to the data file.
:return: the offset in the data file raw_data was written.
assert isinstance(raw_data, str), 'data must be plain bytes'
if not self._need_to_create:
return self._transport.append_bytes(self._filename, raw_data)
self._transport.put_bytes_non_atomic(self._filename, raw_data,
create_parent_dir=self._create_parent_dir,
mode=self._file_mode,
dir_mode=self._dir_mode)
self._need_to_create = False
def add_record(self, version_id, digest, lines):
"""Write new text record to disk. Returns the position in the
file where it was written."""
size, sio = self._record_to_data(version_id, digest, lines)
if not self._need_to_create:
start_pos = self._transport.append_file(self._filename, sio)
self._transport.put_file_non_atomic(self._filename, sio,
create_parent_dir=self._create_parent_dir,
mode=self._file_mode,
dir_mode=self._dir_mode)
self._need_to_create = False
self._cache[version_id] = sio.getvalue()
return start_pos, size
def _parse_record_header(self, version_id, raw_data):
"""Parse a record header for consistency.
:return: the header and the decompressor stream.
as (stream, header_record)
df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
rec = self._check_header(version_id, df.readline())
def _check_header(self, version_id, line):
raise KnitCorrupt(self._filename,
'unexpected number of elements in record header')
if cache_utf8.decode(rec[1]) != version_id:
raise KnitCorrupt(self._filename,
'unexpected version, wanted %r, got %r'
% (version_id, rec[1]))
def _parse_record(self, version_id, data):
# 4168 calls in 2880 217 internal
# 4168 calls to _parse_record_header in 2121
# 4168 calls to readlines in 330
df = GzipFile(mode='rb', fileobj=StringIO(data))
record_contents = df.readlines()
header = record_contents.pop(0)
rec = self._check_header(version_id, header)
last_line = record_contents.pop()
assert len(record_contents) == int(rec[2])
if last_line != 'end %s\n' % rec[1]:
raise KnitCorrupt(self._filename,
'unexpected version end line %r, wanted %r'
% (last_line, version_id))
return record_contents, rec[3]
def read_records_iter_raw(self, records):
"""Read text records from data file and yield raw data.
This unpacks enough of the text record to validate the id is
as expected but that's all.
# setup an iterator of the external records:
# uses readv so nice and fast we hope.
# grab the disk data needed.
# Don't check _cache if it is empty
needed_offsets = [(pos, size) for version_id, pos, size
if version_id not in self._cache]
needed_offsets = [(pos, size) for version_id, pos, size
raw_records = self._transport.readv(self._filename, needed_offsets)
for version_id, pos, size in records:
if version_id in self._cache:
# This data has already been validated
data = self._cache[version_id]
pos, data = raw_records.next()
self._cache[version_id] = data
# validate the header
df, rec = self._parse_record_header(version_id, data)
yield version_id, data
def read_records_iter(self, records):
"""Read text records from data file and yield result.
The result will be returned in whatever order is fastest to read,
not the order requested. Also, multiple requests for the same
record will only yield 1 response.
:param records: A list of (version_id, pos, len) entries
:return: Yields (version_id, contents, digest) in the order
read, not the order requested
# Skip records we have already seen
yielded_records = set()
needed_records = set()
for record in records:
if record[0] in self._cache:
if record[0] in yielded_records:
yielded_records.add(record[0])
data = self._cache[record[0]]
content, digest = self._parse_record(record[0], data)
yield (record[0], content, digest)
needed_records.add(record)
needed_records = sorted(needed_records, key=operator.itemgetter(1))
needed_records = sorted(set(records), key=operator.itemgetter(1))
if not needed_records:
# The transport optimizes the fetching as well
# (ie, reads continuous ranges.)
readv_response = self._transport.readv(self._filename,
[(pos, size) for version_id, pos, size in needed_records])
for (version_id, pos, size), (pos, data) in \
izip(iter(needed_records), readv_response):
content, digest = self._parse_record(version_id, data)
self._cache[version_id] = data
yield version_id, content, digest
def read_records(self, records):
"""Read records into a dictionary."""
for record_id, content, digest in \
self.read_records_iter(records):
components[record_id] = (content, digest)
class InterKnit(InterVersionedFile):
"""Optimised code paths for knit to knit operations."""
_matching_file_from_factory = KnitVersionedFile
_matching_file_to_factory = KnitVersionedFile
def is_compatible(source, target):
"""Be compatible with knits. """
return (isinstance(source, KnitVersionedFile) and
isinstance(target, KnitVersionedFile))
except AttributeError:
def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
"""See InterVersionedFile.join."""
assert isinstance(self.source, KnitVersionedFile)
assert isinstance(self.target, KnitVersionedFile)
version_ids = self._get_source_version_ids(version_ids, ignore_missing)
pb = ui.ui_factory.nested_progress_bar()
version_ids = list(version_ids)
if None in version_ids:
version_ids.remove(None)
self.source_ancestry = set(self.source.get_ancestry(version_ids))
this_versions = set(self.target._index.get_versions())
needed_versions = self.source_ancestry - this_versions
cross_check_versions = self.source_ancestry.intersection(this_versions)
mismatched_versions = set()
for version in cross_check_versions:
# scan to include needed parents.
n1 = set(self.target.get_parents_with_ghosts(version))
n2 = set(self.source.get_parents_with_ghosts(version))
# FIXME TEST this check for cycles being introduced works
# the logic is we have a cycle if in our graph we are an
# ancestor of any of the n2 revisions.
parent_ancestors = self.source.get_ancestry(parent)
if version in parent_ancestors:
raise errors.GraphCycleError([parent, version])
# ensure this parent will be available later.
new_parents = n2.difference(n1)
needed_versions.update(new_parents.difference(this_versions))
mismatched_versions.add(version)
if not needed_versions and not mismatched_versions:
full_list = topo_sort(self.source.get_graph())
version_list = [i for i in full_list if (not self.target.has_version(i)
and i in needed_versions)]
copy_queue_records = []
for version_id in version_list:
options = self.source._index.get_options(version_id)
parents = self.source._index.get_parents_with_ghosts(version_id)
# check that it will be a consistent copy:
for parent in parents:
# if source has the parent, we must :
# * already have it or
# * have it scheduled already
# otherwise we don't care
assert (self.target.has_version(parent) or
parent in copy_set or
not self.source.has_version(parent))
data_pos, data_size = self.source._index.get_position(version_id)
copy_queue_records.append((version_id, data_pos, data_size))
copy_queue.append((version_id, options, parents))
copy_set.add(version_id)
# data suck the join:
total = len(version_list)
for (version_id, raw_data), \
(version_id2, options, parents) in \
izip(self.source._data.read_records_iter_raw(copy_queue_records),
assert version_id == version_id2, 'logic error, inconsistent results'
pb.update("Joining knit", count, total)
raw_records.append((version_id, options, parents, len(raw_data)))
raw_datum.append(raw_data)
self.target._add_raw_records(raw_records, ''.join(raw_datum))
for version in mismatched_versions:
# FIXME RBC 20060309 is this needed?
n1 = set(self.target.get_parents_with_ghosts(version))
n2 = set(self.source.get_parents_with_ghosts(version))
# write a combined record to our history preserving the current
# parents as first in the list
new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
self.target.fix_parents(version, new_parents)


InterVersionedFile.register_optimiser(InterKnit)
class WeaveToKnit(InterVersionedFile):
"""Optimised code paths for weave to knit operations."""
_matching_file_from_factory = bzrlib.weave.WeaveFile
_matching_file_to_factory = KnitVersionedFile
def is_compatible(source, target):
"""Be compatible with weaves to knits."""
return (isinstance(source, bzrlib.weave.Weave) and
isinstance(target, KnitVersionedFile))
except AttributeError:
def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
"""See InterVersionedFile.join."""
assert isinstance(self.source, bzrlib.weave.Weave)
assert isinstance(self.target, KnitVersionedFile)
version_ids = self._get_source_version_ids(version_ids, ignore_missing)
pb = ui.ui_factory.nested_progress_bar()
version_ids = list(version_ids)
self.source_ancestry = set(self.source.get_ancestry(version_ids))
this_versions = set(self.target._index.get_versions())
needed_versions = self.source_ancestry - this_versions
cross_check_versions = self.source_ancestry.intersection(this_versions)
mismatched_versions = set()
for version in cross_check_versions:
# scan to include needed parents.
n1 = set(self.target.get_parents_with_ghosts(version))
n2 = set(self.source.get_parents(version))
# if all of n2's parents are in n1, then it's fine.
if n2.difference(n1):
# FIXME TEST this check for cycles being introduced works
# the logic is we have a cycle if in our graph we are an
# ancestor of any of the n2 revisions.
parent_ancestors = self.source.get_ancestry(parent)
if version in parent_ancestors:
raise errors.GraphCycleError([parent, version])
# ensure this parent will be available later.
new_parents = n2.difference(n1)
needed_versions.update(new_parents.difference(this_versions))
mismatched_versions.add(version)
if not needed_versions and not mismatched_versions:
full_list = topo_sort(self.source.get_graph())
version_list = [i for i in full_list if (not self.target.has_version(i)
and i in needed_versions)]
total = len(version_list)
for version_id in version_list:
pb.update("Converting to knit", count, total)
parents = self.source.get_parents(version_id)
# check that it will be a consistent copy:
for parent in parents:
# if source has the parent, we must already have it
assert (self.target.has_version(parent))
self.target.add_lines(
version_id, parents, self.source.get_lines(version_id))
for version in mismatched_versions:
# FIXME RBC 20060309 is this needed?
n1 = set(self.target.get_parents_with_ghosts(version))
n2 = set(self.source.get_parents(version))
# write a combined record to our history preserving the current
# parents as first in the list
new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
self.target.fix_parents(version, new_parents)


InterVersionedFile.register_optimiser(WeaveToKnit)
class KnitSequenceMatcher(difflib.SequenceMatcher):
"""Knit tuned sequence matcher.

This is based on profiling of difflib which indicated some improvements
for our usage pattern.
def find_longest_match(self, alo, ahi, blo, bhi):
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].

If isjunk is not defined:

Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
alo <= i <= i+k <= ahi
blo <= j <= j+k <= bhi
and for all (i',j',k') meeting those conditions,
and if i == i', j <= j'

In other words, of all maximal matching blocks, return one that
starts earliest in a, and of all those maximal matching blocks that
start earliest in a, return the one that starts earliest in b.

>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
>>> s.find_longest_match(0, 5, 0, 9)

If isjunk is defined, first the longest matching block is
determined as above, but with the additional restriction that no
junk element appears in the block. Then that block is extended as
far as possible by matching (only) junk elements on both sides. So
the resulting block never matches on junk except as identical junk
happens to be adjacent to an "interesting" match.

Here's the same example as before, but considering blanks to be
junk. That prevents " abcd" from matching the " abcd" at the tail
end of the second sequence directly. Instead only the "abcd" can
match, and matches the leftmost "abcd" in the second sequence:

>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
>>> s.find_longest_match(0, 5, 0, 9)

If no blocks match, return (alo, blo, 0).

>>> s = SequenceMatcher(None, "ab", "c")
>>> s.find_longest_match(0, 2, 0, 1)

# CAUTION: stripping common prefix or suffix would be incorrect.
# Longest matching block is "ab", but if common prefix is
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
# strip, so ends up claiming that ab is changed to acab by
# inserting "ca" in the middle. That's minimal but unintuitive:
# "it's obvious" that someone inserted "ac" at the front.
# Windiff ends up at the same place as diff, but by pairing up
# the unique 'b's and then matching the first two 'a's.
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
besti, bestj, bestsize = alo, blo, 0
# find longest junk-free match
# during an iteration of the loop, j2len[j] = length of longest
# junk-free match ending with a[i-1] and b[j]
for i in xrange(alo, ahi):
# look at all instances of a[i] in b; note that because
# b2j has no junk keys, the loop is skipped if a[i] is junk
j2lenget = j2len.get
# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
# following improvement
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
# Extend the best by non-junk elements on each end. In particular,
# "popular" non-junk elements aren't in b2j, which greatly speeds
# the inner loop above, but also means "the best" match so far
# doesn't contain any junk *or* popular non-junk elements.
while besti > alo and bestj > blo and \
not isbjunk(b[bestj-1]) and \
a[besti-1] == b[bestj-1]:
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
while besti+bestsize < ahi and bestj+bestsize < bhi and \
not isbjunk(b[bestj+bestsize]) and \
a[besti+bestsize] == b[bestj+bestsize]:
# Now that we have a wholly interesting match (albeit possibly
# empty!), we may as well suck up the matching junk on each
# side of it too. Can't think of a good reason not to, and it
# saves post-processing the (possibly considerable) expense of
# figuring out what to do with it. In the case of an empty
# interesting match, this is clearly the right thing to do,
# because no other kind of match is possible in the regions.
while besti > alo and bestj > blo and \
isbjunk(b[bestj-1]) and \
a[besti-1] == b[bestj-1]:
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
while besti+bestsize < ahi and bestj+bestsize < bhi and \
isbjunk(b[bestj+bestsize]) and \
a[besti+bestsize] == b[bestj+bestsize]:
bestsize = bestsize + 1
return besti, bestj, bestsize