3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# before intset (r923) 2000 versions in 41.5s
25
# with intset (r926) 2000 versions in 93s !!!
26
# better to just use plain sets.
28
# making _extract build and return a list, rather than being a generator
31
# TODO: Perhaps have copy method for Weave instances?
33
# XXX: If we do weaves this way, will a merge still behave the same
34
# way if it's done in a different order? That's a pretty desirable
37
# TODO: Nothing here so far assumes the lines are really \n newlines,
38
# rather than being split up in some other way. We could accomodate
39
# binaries, perhaps by naively splitting on \n or perhaps using
40
# something like a rolling checksum.
42
# TODO: Track version names as well as indexes.
44
# TODO: End marker for each version so we can stop reading?
46
# TODO: Check that no insertion occurs inside a deletion that was
47
# active in the version of the insertion.
49
# TODO: In addition to the SHA-1 check, perhaps have some code that
50
# checks structural constraints of the weave: ie that insertions are
51
# properly nested, that there is no text outside of an insertion, that
52
# insertions or deletions are not repeated, etc.
54
# TODO: Make the info command just show info, not extract everything:
55
# it can be much faster.
57
# TODO: Perhaps use long integers as sets instead of set objects; may
60
# TODO: Parallel-extract that passes back each line along with a
61
# description of which revisions include it. Nice for checking all
67
class WeaveError(Exception):
68
"""Exception in processing weave"""
71
class WeaveFormatError(WeaveError):
72
"""Weave invariant violated"""
76
"""weave - versioned text file storage.
78
A Weave manages versions of line-based text files, keeping track
79
of the originating version for each line.
81
To clients the "lines" of the file are represented as a list of strings.
82
These strings will typically have terminal newline characters, but
83
this is not required. In particular files commonly do not have a newline
84
at the end of the file.
86
Texts can be identified in either of two ways:
88
* a nonnegative index number.
90
* a version-id string.
92
Typically the index number will be valid only inside this weave and
93
the version-id is used to reference it in the larger world.
95
The weave is represented as a list mixing edit instructions and
96
literal text. Each entry in _l can be either a string (or
97
unicode), or a tuple. If a string, it means that the given line
98
should be output in the currently active revisions.
100
If a tuple, it gives a processing instruction saying in which
101
revisions the enclosed lines are active. The tuple has the form
102
(instruction, version).
104
The instruction can be '{' or '}' for an insertion block, and '['
105
and ']' for a deletion block respectively. The version is the
106
integer version index. There is no replace operator, only deletes
111
* A later version can delete lines that were introduced by any
112
number of ancestor versions; this implies that deletion
113
instructions can span insertion blocks without regard to the
114
insertion block's nesting.
116
* Similarly, deletions need not be properly nested with regard to
117
each other, because they might have been generated by
118
independent revisions.
120
* Insertions are always made by inserting a new bracketed block
121
into a single point in the previous weave. This implies they
122
can nest but not overlap, and the nesting must always have later
123
insertions on the inside.
125
* It doesn't seem very useful to have an active insertion
126
inside an inactive insertion, but it might happen.
128
* Therefore, all instructions are always"considered"; that
129
is passed onto and off the stack. An outer inactive block
130
doesn't disable an inner block.
132
* Lines are enabled if the most recent enclosing insertion is
133
active and none of the enclosing deletions are active.
135
* There is no point having a deletion directly inside its own
136
insertion; you might as well just not write it. And there
137
should be no way to get an earlier version deleting a later
144
List of parents, indexed by version number.
145
It is only necessary to store the minimal set of parents for
146
each version; the parent's parents are implied.
149
List of hex SHA-1 of each version, or None if not recorded.
157
def __eq__(self, other):
158
if not isinstance(other, Weave):
160
return self._v == other._v \
161
and self._l == other._l
164
def __ne__(self, other):
165
return not self.__eq__(other)
168
def add(self, parents, text):
169
"""Add a single text on top of the weave.
171
Returns the index number of the newly added version.
174
List or set of direct parent version numbers.
177
Sequence of lines to be added in the new version."""
178
## self._check_versions(parents)
179
## self._check_lines(text)
189
# TODO: It'd probably be faster to append things on to a new
190
# list rather than modifying the existing one, which is likely
191
# to cause a lot of copying.
194
ancestors = self.inclusions(parents)
195
delta = self._delta(ancestors, text)
197
# offset gives the number of lines that have been inserted
198
# into the weave up to the current point; if the original edit instruction
199
# says to change line A then we actually change (A+offset)
202
for i1, i2, newlines in delta:
205
assert i2 <= len(self._l)
207
# the deletion and insertion are handled separately.
208
# first delete the region.
210
self._l.insert(i1+offset, ('[', idx))
211
self._l.insert(i2+offset+1, (']', idx))
216
# there may have been a deletion spanning up to
217
# i2; we want to insert after this region to make sure
218
# we don't destroy ourselves
220
self._l[i:i] = [('{', idx)] \
223
offset += 2 + len(newlines)
225
self._addversion(parents)
227
# special case; adding with no parents revision; can do this
228
# more quickly by just appending unconditionally
229
self._l.append(('{', idx))
231
self._l.append(('}', idx))
233
self._addversion(None)
235
self._sha1s.append(sha1)
240
def inclusions(self, versions):
241
"""Return set of all ancestors of given version(s)."""
247
# include all its parents
252
raise ValueError("version %d not present in weave" % v)
255
def minimal_parents(self, version):
256
"""Find the minimal set of parents for the version."""
257
included = self._v[version]
262
li.sort(reverse=True)
270
gotit.update(self.inclusions(pv))
272
assert mininc[0] >= 0
273
assert mininc[-1] < version
277
def _addversion(self, parents):
279
self._v.append(parents)
281
self._v.append(set())
284
def _check_lines(self, text):
285
if not isinstance(text, list):
286
raise ValueError("text should be a list, not %s" % type(text))
289
if not isinstance(l, basestring):
290
raise ValueError("text line should be a string or unicode, not %s"
295
def _check_versions(self, indexes):
296
"""Check everything in the sequence of indexes is valid"""
301
raise IndexError("invalid version number %r" % i)
304
def annotate(self, index):
305
return list(self.annotate_iter(index))
308
def annotate_iter(self, version):
309
"""Yield list of (index-id, line) pairs for the specified version.
311
The index indicates when the line originated in the weave."""
312
for origin, lineno, text in self._extract([version]):
320
(lineno, insert, deletes, text)
321
for each literal line.
327
lineno = 0 # line of weave, 0-based
330
if isinstance(l, tuple):
343
raise WeaveFormatError('unexpected instruction %r'
346
assert isinstance(l, basestring)
348
yield lineno, istack[-1], dset, l
353
def _extract(self, versions):
354
"""Yield annotation of lines in included set.
356
Yields a sequence of tuples (origin, lineno, text), where
357
origin is the origin version, lineno the index in the weave,
358
and text the text of the line.
360
The set typically but not necessarily corresponds to a version.
362
included = self.inclusions(versions)
367
lineno = 0 # line of weave, 0-based
373
WFE = WeaveFormatError
376
if isinstance(l, tuple):
380
assert v not in istack
395
assert isinstance(l, basestring)
397
isactive = (not dset) and istack and (istack[-1] in included)
399
result.append((istack[-1], lineno, l))
403
raise WFE("unclosed insertion blocks at end of weave",
406
raise WFE("unclosed deletion blocks at end of weave",
413
def get_iter(self, version):
414
"""Yield lines for the specified version."""
415
for origin, lineno, line in self._extract([version]):
419
def get(self, index):
420
return list(self.get_iter(index))
423
def mash_iter(self, included):
424
"""Return composed version of multiple included versions."""
425
for origin, lineno, text in self._extract(included):
429
def dump(self, to_file):
430
from pprint import pprint
431
print >>to_file, "Weave._l = ",
432
pprint(self._l, to_file)
433
print >>to_file, "Weave._v = ",
434
pprint(self._v, to_file)
438
def numversions(self):
440
assert l == len(self._sha1s)
444
def check(self, progress_bar=None):
445
# check no circular inclusions
446
for version in range(self.numversions()):
447
inclusions = list(self._v[version])
450
if inclusions[-1] >= version:
451
raise WeaveFormatError("invalid included version %d for index %d"
452
% (inclusions[-1], version))
454
# try extracting all versions; this is a bit slow and parallel
455
# extraction could be used
457
nv = self.numversions()
458
for version in range(nv):
460
progress_bar.update('checking text', version, nv)
462
for l in self.get_iter(version):
465
expected = self._sha1s[version]
467
raise WeaveError("mismatched sha1 for version %d; "
468
"got %s, expected %s"
469
% (version, hd, expected))
471
# TODO: check insertions are properly nested, that there are
472
# no lines outside of insertion blocks, that deletions are
473
# properly paired, etc.
477
def merge(self, merge_versions):
478
"""Automerge and mark conflicts between versions.
480
This returns a sequence, each entry describing alternatives
481
for a chunk of the file. Each of the alternatives is given as
484
If there is a chunk of the file where there's no diagreement,
485
only one alternative is given.
488
# approach: find the included versions common to all the
490
raise NotImplementedError()
494
def _delta(self, included, lines):
495
"""Return changes from basis to new revision.
497
The old text for comparison is the union of included revisions.
499
This is used in inserting a new text.
501
Delta is returned as a sequence of
502
(weave1, weave2, newlines).
504
This indicates that weave1:weave2 of the old weave should be
505
replaced by the sequence of lines in newlines. Note that
506
these line numbers are positions in the total weave and don't
507
correspond to the lines in any extracted version, or even the
508
extracted union of included versions.
510
If line1=line2, this is a pure insert; if newlines=[] this is a
511
pure delete. (Similar to difflib.)
513
# basis a list of (origin, lineno, line)
516
for origin, lineno, line in self._extract(included):
517
basis_lineno.append(lineno)
518
basis_lines.append(line)
520
# add a sentinal, because we can also match against the final line
521
basis_lineno.append(len(self._l))
523
# XXX: which line of the weave should we really consider
524
# matches the end of the file? the current code says it's the
525
# last line of the weave?
527
from difflib import SequenceMatcher
528
s = SequenceMatcher(None, basis_lines, lines)
530
# TODO: Perhaps return line numbers from composed weave as well?
532
for tag, i1, i2, j1, j2 in s.get_opcodes():
533
##print tag, i1, i2, j1, j2
538
# i1,i2 are given in offsets within basis_lines; we need to map them
539
# back to offsets within the entire weave
540
real_i1 = basis_lineno[i1]
541
real_i2 = basis_lineno[i2]
545
assert j2 <= len(lines)
547
yield real_i1, real_i2, lines[j1:j2]
551
def plan_merge(self, ver_a, ver_b):
552
"""Return pseudo-annotation indicating how the two versions merge.
554
This is computed between versions a and b and their common
557
Weave lines present in none of them are skipped entirely.
559
inc_a = self.inclusions([ver_a])
560
inc_b = self.inclusions([ver_b])
561
inc_c = inc_a & inc_b
563
for lineno, insert, deleteset, line in self._walk():
564
if deleteset & inc_c:
565
# killed in parent; can't be in either a or b
566
# not relevant to our work
567
yield 'killed-base', line
568
elif insert in inc_c:
569
# was inserted in base
570
killed_a = bool(deleteset & inc_a)
571
killed_b = bool(deleteset & inc_b)
572
if killed_a and killed_b:
573
yield 'killed-both', line
575
yield 'killed-a', line
577
yield 'killed-b', line
579
yield 'unchanged', line
580
elif insert in inc_a:
581
if deleteset & inc_a:
582
yield 'ghost-a', line
586
elif insert in inc_b:
587
if deleteset & inc_b:
588
yield 'ghost-b', line
592
# not in either revision
593
yield 'irrelevant', line
595
yield 'unchanged', '' # terminator
599
def weave_merge(self, plan):
604
for state, line in plan:
605
if state == 'unchanged' or state == 'killed-both':
606
# resync and flush queued conflicts changes if any
607
if not lines_a and not lines_b:
609
elif ch_a and not ch_b:
611
for l in lines_a: yield l
612
elif ch_b and not ch_a:
613
for l in lines_b: yield l
614
elif lines_a == lines_b:
615
for l in lines_a: yield l
618
for l in lines_a: yield l
620
for l in lines_b: yield l
627
if state == 'unchanged':
630
elif state == 'killed-a':
633
elif state == 'killed-b':
636
elif state == 'new-a':
639
elif state == 'new-b':
643
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
653
def weave_info(filename, out):
654
"""Show some text information about the weave."""
655
from weavefile import read_weave
656
wf = file(filename, 'rb')
658
# FIXME: doesn't work on pipes
659
weave_size = wf.tell()
660
print >>out, "weave file size %d bytes" % weave_size
661
print >>out, "weave contains %d versions" % len(w._v)
664
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
665
for i in (6, 6, 8, 40, 20):
668
for i in range(len(w._v)):
671
bytes = sum((len(a) for a in text))
673
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
679
print >>out, "versions total %d bytes" % total
680
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
684
print """bzr weave tool
686
Experimental tool for weave algorithm.
690
Create an empty weave file
691
weave get WEAVEFILE VERSION
692
Write out specified version.
693
weave check WEAVEFILE
694
Check consistency of all versions.
696
Display table of contents.
697
weave add WEAVEFILE [BASE...] < NEWTEXT
698
Add NEWTEXT, with specified parent versions.
699
weave annotate WEAVEFILE VERSION
700
Display origin of each line.
701
weave mash WEAVEFILE VERSION...
702
Display composite of all selected versions.
703
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
704
Auto-merge two versions and display conflicts.
708
% weave init foo.weave
710
% weave add foo.weave < foo.txt
713
(create updated version)
715
% weave get foo.weave 0 | diff -u - foo.txt
716
% weave add foo.weave 0 < foo.txt
719
% weave get foo.weave 0 > foo.txt (create forked version)
721
% weave add foo.weave 0 < foo.txt
724
% weave merge foo.weave 1 2 > foo.txt (merge them)
725
% vi foo.txt (resolve conflicts)
726
% weave add foo.weave 1 2 < foo.txt (commit merged version)
735
from weavefile import write_weave, read_weave
736
from bzrlib.progress import ProgressBar
744
return read_weave(file(argv[2], 'rb'))
750
# at the moment, based on everything in the file
751
parents = map(int, argv[3:])
752
lines = sys.stdin.readlines()
753
ver = w.add(parents, lines)
754
write_weave(w, file(argv[2], 'wb'))
755
print 'added version %d' % ver
758
if os.path.exists(fn):
759
raise IOError("file exists")
761
write_weave(w, file(fn, 'wb'))
762
elif cmd == 'get': # get one version
764
sys.stdout.writelines(w.get_iter(int(argv[3])))
766
elif cmd == 'mash': # get composite
768
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
770
elif cmd == 'annotate':
772
# newline is added to all lines regardless; too hard to get
773
# reasonable formatting otherwise
775
for origin, text in w.annotate(int(argv[3])):
776
text = text.rstrip('\r\n')
778
print ' | %s' % (text)
780
print '%5d | %s' % (origin, text)
784
weave_info(argv[2], sys.stdout)
792
elif cmd == 'inclusions':
794
print ' '.join(map(str, w.inclusions([int(argv[3])])))
796
elif cmd == 'parents':
798
print ' '.join(map(str, w._v[int(argv[3])]))
800
elif cmd == 'plan-merge':
802
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
804
print '%14s | %s' % (state, line),
808
p = w.plan_merge(int(argv[3]), int(argv[4]))
809
sys.stdout.writelines(w.weave_merge(p))
811
elif cmd == 'mash-merge':
817
v1, v2 = map(int, argv[3:5])
819
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
821
base_lines = list(w.mash_iter(basis))
822
a_lines = list(w.get(v1))
823
b_lines = list(w.get(v2))
825
from bzrlib.merge3 import Merge3
826
m3 = Merge3(base_lines, a_lines, b_lines)
828
name_a = 'version %d' % v1
829
name_b = 'version %d' % v2
830
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
832
raise ValueError('unknown command %r' % cmd)
835
if __name__ == '__main__':
837
sys.exit(main(sys.argv))