3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# before intset (r923) 2000 versions in 41.5s
25
# with intset (r926) 2000 versions in 93s !!!
26
# better to just use plain sets.
28
# making _extract build and return a list, rather than being a generator
31
# with python -O, r923 does 2000 versions in 36.87s
33
# with optimizations to avoid mutating lists - 35.75! I guess copying
34
# all the elements every time costs more than the small manipulations.
35
# a surprisingly small change.
37
# r931, which avoids using a generator for extract, does 36.98s
39
# with memoized inclusions, takes 41.49s; not very good
41
# with slots, takes 37.35s; without takes 39.16, a bit surprising
43
# with the delta calculation mixed in with the add method, rather than
44
# separated, takes 36.78s
46
# with delta folded in and mutation of the list, 36.13s
48
# with all this and simplification of add code, 33s
51
# TODO: Perhaps have copy method for Weave instances?
53
# XXX: If we do weaves this way, will a merge still behave the same
54
# way if it's done in a different order? That's a pretty desirable
57
# TODO: Nothing here so far assumes the lines are really \n newlines,
58
# rather than being split up in some other way. We could accomodate
59
# binaries, perhaps by naively splitting on \n or perhaps using
60
# something like a rolling checksum.
62
# TODO: Track version names as well as indexes.
64
# TODO: End marker for each version so we can stop reading?
66
# TODO: Check that no insertion occurs inside a deletion that was
67
# active in the version of the insertion.
69
# TODO: In addition to the SHA-1 check, perhaps have some code that
70
# checks structural constraints of the weave: ie that insertions are
71
# properly nested, that there is no text outside of an insertion, that
72
# insertions or deletions are not repeated, etc.
74
# TODO: Make the info command just show info, not extract everything:
75
# it can be much faster.
77
# TODO: Perhaps use long integers as sets instead of set objects; may
80
# TODO: Parallel-extract that passes back each line along with a
81
# description of which revisions include it. Nice for checking all
87
class WeaveError(Exception):
88
"""Exception in processing weave"""
91
class WeaveFormatError(WeaveError):
92
"""Weave invariant violated"""
96
"""weave - versioned text file storage.
98
A Weave manages versions of line-based text files, keeping track
99
of the originating version for each line.
101
To clients the "lines" of the file are represented as a list of strings.
102
These strings will typically have terminal newline characters, but
103
this is not required. In particular files commonly do not have a newline
104
at the end of the file.
106
Texts can be identified in either of two ways:
108
* a nonnegative index number.
110
* a version-id string.
112
Typically the index number will be valid only inside this weave and
113
the version-id is used to reference it in the larger world.
115
The weave is represented as a list mixing edit instructions and
116
literal text. Each entry in _l can be either a string (or
117
unicode), or a tuple. If a string, it means that the given line
118
should be output in the currently active revisions.
120
If a tuple, it gives a processing instruction saying in which
121
revisions the enclosed lines are active. The tuple has the form
122
(instruction, version).
124
The instruction can be '{' or '}' for an insertion block, and '['
125
and ']' for a deletion block respectively. The version is the
126
integer version index. There is no replace operator, only deletes
131
* A later version can delete lines that were introduced by any
132
number of ancestor versions; this implies that deletion
133
instructions can span insertion blocks without regard to the
134
insertion block's nesting.
136
* Similarly, deletions need not be properly nested with regard to
137
each other, because they might have been generated by
138
independent revisions.
140
* Insertions are always made by inserting a new bracketed block
141
into a single point in the previous weave. This implies they
142
can nest but not overlap, and the nesting must always have later
143
insertions on the inside.
145
* It doesn't seem very useful to have an active insertion
146
inside an inactive insertion, but it might happen.
148
* Therefore, all instructions are always"considered"; that
149
is passed onto and off the stack. An outer inactive block
150
doesn't disable an inner block.
152
* Lines are enabled if the most recent enclosing insertion is
153
active and none of the enclosing deletions are active.
155
* There is no point having a deletion directly inside its own
156
insertion; you might as well just not write it. And there
157
should be no way to get an earlier version deleting a later
164
List of parents, indexed by version number.
165
It is only necessary to store the minimal set of parents for
166
each version; the parent's parents are implied.
169
List of hex SHA-1 of each version, or None if not recorded.
172
## __slots__ = ['_l', '_v', '_sha1s']
180
def __eq__(self, other):
181
if not isinstance(other, Weave):
183
return self._v == other._v \
184
and self._l == other._l
187
def __ne__(self, other):
188
return not self.__eq__(other)
191
def add(self, parents, text):
192
"""Add a single text on top of the weave.
194
Returns the index number of the newly added version.
197
List or set of direct parent version numbers.
200
Sequence of lines to be added in the new version."""
202
self._check_versions(parents)
203
## self._check_lines(text)
204
new_version = len(self._v)
212
# if we abort after here the weave will be corrupt
213
self._v.append(frozenset(parents))
214
self._sha1s.append(sha1)
218
# special case; adding with no parents revision; can do
219
# this more quickly by just appending unconditionally.
220
# even more specially, if we're adding an empty text we
221
# need do nothing at all.
223
self._l.append(('{', new_version))
225
self._l.append(('}', new_version))
229
if len(parents) == 1 and sha1 == self._sha1s[parents[0]]:
230
# special case: same as the single parent
234
ancestors = self.inclusions(parents)
238
# basis a list of (origin, lineno, line)
241
for origin, lineno, line in self._extract(ancestors):
242
basis_lineno.append(lineno)
243
basis_lines.append(line)
245
# another small special case: a merge, producing the same text as auto-merge
246
if text == basis_lines:
249
# add a sentinal, because we can also match against the final line
250
basis_lineno.append(len(self._l))
252
# XXX: which line of the weave should we really consider
253
# matches the end of the file? the current code says it's the
254
# last line of the weave?
256
#print 'basis_lines:', basis_lines
257
#print 'new_lines: ', lines
259
from difflib import SequenceMatcher
260
s = SequenceMatcher(None, basis_lines, text)
262
# offset gives the number of lines that have been inserted
263
# into the weave up to the current point; if the original edit instruction
264
# says to change line A then we actually change (A+offset)
267
for tag, i1, i2, j1, j2 in s.get_opcodes():
268
# i1,i2 are given in offsets within basis_lines; we need to map them
269
# back to offsets within the entire weave
270
#print 'raw match', tag, i1, i2, j1, j2
274
i1 = basis_lineno[i1]
275
i2 = basis_lineno[i2]
277
assert 0 <= i1 <= i2 <= len(old_l)
278
assert 0 <= j1 <= j2 <= len(text)
280
#print tag, i1, i2, j1, j2
282
# the deletion and insertion are handled separately.
283
# first delete the region.
285
self._l.insert(i1+offset, ('[', new_version))
286
self._l.insert(i2+offset+1, (']', new_version))
290
# there may have been a deletion spanning up to
291
# i2; we want to insert after this region to make sure
292
# we don't destroy ourselves
294
self._l[i:i] = ([('{', new_version)]
296
+ [('}', new_version)])
297
offset += 2 + (j2 - j1)
302
def inclusions(self, versions):
303
"""Return set of all ancestors of given version(s)."""
309
# include all its parents
314
raise ValueError("version %d not present in weave" % v)
317
def minimal_parents(self, version):
318
"""Find the minimal set of parents for the version."""
319
included = self._v[version]
324
li.sort(reverse=True)
332
gotit.update(self.inclusions(pv))
334
assert mininc[0] >= 0
335
assert mininc[-1] < version
340
def _check_lines(self, text):
341
if not isinstance(text, list):
342
raise ValueError("text should be a list, not %s" % type(text))
345
if not isinstance(l, basestring):
346
raise ValueError("text line should be a string or unicode, not %s"
351
def _check_versions(self, indexes):
352
"""Check everything in the sequence of indexes is valid"""
357
raise IndexError("invalid version number %r" % i)
360
def annotate(self, index):
361
return list(self.annotate_iter(index))
364
def annotate_iter(self, version):
365
"""Yield list of (index-id, line) pairs for the specified version.
367
The index indicates when the line originated in the weave."""
368
for origin, lineno, text in self._extract([version]):
376
(lineno, insert, deletes, text)
377
for each literal line.
383
lineno = 0 # line of weave, 0-based
386
if isinstance(l, tuple):
399
raise WeaveFormatError('unexpected instruction %r'
402
assert isinstance(l, basestring)
404
yield lineno, istack[-1], dset, l
409
def _extract(self, versions):
410
"""Yield annotation of lines in included set.
412
Yields a sequence of tuples (origin, lineno, text), where
413
origin is the origin version, lineno the index in the weave,
414
and text the text of the line.
416
The set typically but not necessarily corresponds to a version.
418
included = self.inclusions(versions)
423
lineno = 0 # line of weave, 0-based
429
WFE = WeaveFormatError
432
if isinstance(l, tuple):
436
assert v not in istack
451
assert isinstance(l, basestring)
453
isactive = (not dset) and istack and (istack[-1] in included)
455
result.append((istack[-1], lineno, l))
459
raise WFE("unclosed insertion blocks at end of weave",
462
raise WFE("unclosed deletion blocks at end of weave",
469
def get_iter(self, version):
470
"""Yield lines for the specified version."""
471
for origin, lineno, line in self._extract([version]):
475
def get(self, index):
476
return list(self.get_iter(index))
479
def mash_iter(self, included):
480
"""Return composed version of multiple included versions."""
481
for origin, lineno, text in self._extract(included):
485
def dump(self, to_file):
486
from pprint import pprint
487
print >>to_file, "Weave._l = ",
488
pprint(self._l, to_file)
489
print >>to_file, "Weave._v = ",
490
pprint(self._v, to_file)
494
def numversions(self):
496
assert l == len(self._sha1s)
500
def check(self, progress_bar=None):
501
# check no circular inclusions
502
for version in range(self.numversions()):
503
inclusions = list(self._v[version])
506
if inclusions[-1] >= version:
507
raise WeaveFormatError("invalid included version %d for index %d"
508
% (inclusions[-1], version))
510
# try extracting all versions; this is a bit slow and parallel
511
# extraction could be used
513
nv = self.numversions()
514
for version in range(nv):
516
progress_bar.update('checking text', version, nv)
518
for l in self.get_iter(version):
521
expected = self._sha1s[version]
523
raise WeaveError("mismatched sha1 for version %d; "
524
"got %s, expected %s"
525
% (version, hd, expected))
527
# TODO: check insertions are properly nested, that there are
528
# no lines outside of insertion blocks, that deletions are
529
# properly paired, etc.
533
def merge(self, merge_versions):
534
"""Automerge and mark conflicts between versions.
536
This returns a sequence, each entry describing alternatives
537
for a chunk of the file. Each of the alternatives is given as
540
If there is a chunk of the file where there's no diagreement,
541
only one alternative is given.
544
# approach: find the included versions common to all the
546
raise NotImplementedError()
550
def _delta(self, included, lines):
551
"""Return changes from basis to new revision.
553
The old text for comparison is the union of included revisions.
555
This is used in inserting a new text.
557
Delta is returned as a sequence of
558
(weave1, weave2, newlines).
560
This indicates that weave1:weave2 of the old weave should be
561
replaced by the sequence of lines in newlines. Note that
562
these line numbers are positions in the total weave and don't
563
correspond to the lines in any extracted version, or even the
564
extracted union of included versions.
566
If line1=line2, this is a pure insert; if newlines=[] this is a
567
pure delete. (Similar to difflib.)
572
def plan_merge(self, ver_a, ver_b):
573
"""Return pseudo-annotation indicating how the two versions merge.
575
This is computed between versions a and b and their common
578
Weave lines present in none of them are skipped entirely.
580
inc_a = self.inclusions([ver_a])
581
inc_b = self.inclusions([ver_b])
582
inc_c = inc_a & inc_b
584
for lineno, insert, deleteset, line in self._walk():
585
if deleteset & inc_c:
586
# killed in parent; can't be in either a or b
587
# not relevant to our work
588
yield 'killed-base', line
589
elif insert in inc_c:
590
# was inserted in base
591
killed_a = bool(deleteset & inc_a)
592
killed_b = bool(deleteset & inc_b)
593
if killed_a and killed_b:
594
yield 'killed-both', line
596
yield 'killed-a', line
598
yield 'killed-b', line
600
yield 'unchanged', line
601
elif insert in inc_a:
602
if deleteset & inc_a:
603
yield 'ghost-a', line
607
elif insert in inc_b:
608
if deleteset & inc_b:
609
yield 'ghost-b', line
613
# not in either revision
614
yield 'irrelevant', line
616
yield 'unchanged', '' # terminator
620
def weave_merge(self, plan):
625
for state, line in plan:
626
if state == 'unchanged' or state == 'killed-both':
627
# resync and flush queued conflicts changes if any
628
if not lines_a and not lines_b:
630
elif ch_a and not ch_b:
632
for l in lines_a: yield l
633
elif ch_b and not ch_a:
634
for l in lines_b: yield l
635
elif lines_a == lines_b:
636
for l in lines_a: yield l
639
for l in lines_a: yield l
641
for l in lines_b: yield l
648
if state == 'unchanged':
651
elif state == 'killed-a':
654
elif state == 'killed-b':
657
elif state == 'new-a':
660
elif state == 'new-b':
664
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
674
def weave_info(filename, out):
675
"""Show some text information about the weave."""
676
from weavefile import read_weave
677
wf = file(filename, 'rb')
679
# FIXME: doesn't work on pipes
680
weave_size = wf.tell()
681
print >>out, "weave file size %d bytes" % weave_size
682
print >>out, "weave contains %d versions" % len(w._v)
685
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
686
for i in (6, 6, 8, 40, 20):
689
for i in range(len(w._v)):
692
bytes = sum((len(a) for a in text))
694
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
700
print >>out, "versions total %d bytes" % total
701
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
705
print """bzr weave tool
707
Experimental tool for weave algorithm.
711
Create an empty weave file
712
weave get WEAVEFILE VERSION
713
Write out specified version.
714
weave check WEAVEFILE
715
Check consistency of all versions.
717
Display table of contents.
718
weave add WEAVEFILE [BASE...] < NEWTEXT
719
Add NEWTEXT, with specified parent versions.
720
weave annotate WEAVEFILE VERSION
721
Display origin of each line.
722
weave mash WEAVEFILE VERSION...
723
Display composite of all selected versions.
724
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
725
Auto-merge two versions and display conflicts.
729
% weave init foo.weave
731
% weave add foo.weave < foo.txt
734
(create updated version)
736
% weave get foo.weave 0 | diff -u - foo.txt
737
% weave add foo.weave 0 < foo.txt
740
% weave get foo.weave 0 > foo.txt (create forked version)
742
% weave add foo.weave 0 < foo.txt
745
% weave merge foo.weave 1 2 > foo.txt (merge them)
746
% vi foo.txt (resolve conflicts)
747
% weave add foo.weave 1 2 < foo.txt (commit merged version)
756
from weavefile import write_weave, read_weave
757
from bzrlib.progress import ProgressBar
765
return read_weave(file(argv[2], 'rb'))
771
# at the moment, based on everything in the file
772
parents = map(int, argv[3:])
773
lines = sys.stdin.readlines()
774
ver = w.add(parents, lines)
775
write_weave(w, file(argv[2], 'wb'))
776
print 'added version %d' % ver
779
if os.path.exists(fn):
780
raise IOError("file exists")
782
write_weave(w, file(fn, 'wb'))
783
elif cmd == 'get': # get one version
785
sys.stdout.writelines(w.get_iter(int(argv[3])))
787
elif cmd == 'mash': # get composite
789
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
791
elif cmd == 'annotate':
793
# newline is added to all lines regardless; too hard to get
794
# reasonable formatting otherwise
796
for origin, text in w.annotate(int(argv[3])):
797
text = text.rstrip('\r\n')
799
print ' | %s' % (text)
801
print '%5d | %s' % (origin, text)
805
weave_info(argv[2], sys.stdout)
812
print '%d versions ok' % w.numversions()
814
elif cmd == 'inclusions':
816
print ' '.join(map(str, w.inclusions([int(argv[3])])))
818
elif cmd == 'parents':
820
print ' '.join(map(str, w._v[int(argv[3])]))
822
elif cmd == 'plan-merge':
824
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
826
print '%14s | %s' % (state, line),
830
p = w.plan_merge(int(argv[3]), int(argv[4]))
831
sys.stdout.writelines(w.weave_merge(p))
833
elif cmd == 'mash-merge':
839
v1, v2 = map(int, argv[3:5])
841
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
843
base_lines = list(w.mash_iter(basis))
844
a_lines = list(w.get(v1))
845
b_lines = list(w.get(v2))
847
from bzrlib.merge3 import Merge3
848
m3 = Merge3(base_lines, a_lines, b_lines)
850
name_a = 'version %d' % v1
851
name_b = 'version %d' % v2
852
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
854
raise ValueError('unknown command %r' % cmd)
857
if __name__ == '__main__':
859
sys.exit(main(sys.argv))