3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# TODO: Perhaps have copy method for Weave instances?
26
# XXX: If we do weaves this way, will a merge still behave the same
27
# way if it's done in a different order? That's a pretty desirable
30
# TODO: Nothing here so far assumes the lines are really \n newlines,
31
# rather than being split up in some other way. We could accomodate
32
# binaries, perhaps by naively splitting on \n or perhaps using
33
# something like a rolling checksum.
35
# TODO: Track version names as well as indexes.
37
# TODO: End marker for each version so we can stop reading?
39
# TODO: Check that no insertion occurs inside a deletion that was
40
# active in the version of the insertion.
42
# TODO: In addition to the SHA-1 check, perhaps have some code that
43
# checks structural constraints of the weave: ie that insertions are
44
# properly nested, that there is no text outside of an insertion, that
45
# insertions or deletions are not repeated, etc.
47
# TODO: Make the info command just show info, not extract everything:
48
# it can be much faster.
50
# TODO: Perhaps use long integers as sets instead of set objects; may
53
# TODO: Parallel-extract that passes back each line along with a
54
# description of which revisions include it. Nice for checking all
64
from sets import Set, ImmutableSet
66
frozenset = ImmutableSet
71
class WeaveError(Exception):
72
"""Exception in processing weave"""
75
class WeaveFormatError(WeaveError):
76
"""Weave invariant violated"""
80
"""weave - versioned text file storage.
82
A Weave manages versions of line-based text files, keeping track
83
of the originating version for each line.
85
To clients the "lines" of the file are represented as a list of strings.
86
These strings will typically have terminal newline characters, but
87
this is not required. In particular files commonly do not have a newline
88
at the end of the file.
90
Texts can be identified in either of two ways:
92
* a nonnegative index number.
94
* a version-id string.
96
Typically the index number will be valid only inside this weave and
97
the version-id is used to reference it in the larger world.
99
The weave is represented as a list mixing edit instructions and
100
literal text. Each entry in _l can be either a string (or
101
unicode), or a tuple. If a string, it means that the given line
102
should be output in the currently active revisions.
104
If a tuple, it gives a processing instruction saying in which
105
revisions the enclosed lines are active. The tuple has the form
106
(instruction, version).
108
The instruction can be '{' or '}' for an insertion block, and '['
109
and ']' for a deletion block respectively. The version is the
110
integer version index. There is no replace operator, only deletes
115
* A later version can delete lines that were introduced by any
116
number of ancestor versions; this implies that deletion
117
instructions can span insertion blocks without regard to the
118
insertion block's nesting.
120
* Similarly, deletions need not be properly nested with regard to
121
each other, because they might have been generated by
122
independent revisions.
124
* Insertions are always made by inserting a new bracketed block
125
into a single point in the previous weave. This implies they
126
can nest but not overlap, and the nesting must always have later
127
insertions on the inside.
129
* It doesn't seem very useful to have an active insertion
130
inside an inactive insertion, but it might happen.
132
* Therefore, all instructions are always"considered"; that
133
is passed onto and off the stack. An outer inactive block
134
doesn't disable an inner block.
136
* Lines are enabled if the most recent enclosing insertion is
137
active and none of the enclosing deletions are active.
139
* There is no point having a deletion directly inside its own
140
insertion; you might as well just not write it. And there
141
should be no way to get an earlier version deleting a later
148
List of parents, indexed by version number.
149
It is only necessary to store the minimal set of parents for
150
each version; the parent's parents are implied.
153
List of hex SHA-1 of each version, or None if not recorded.
161
def __eq__(self, other):
162
if not isinstance(other, Weave):
164
return self._v == other._v \
165
and self._l == other._l
168
def __ne__(self, other):
169
return not self.__eq__(other)
172
def add(self, parents, text):
173
"""Add a single text on top of the weave.
175
Returns the index number of the newly added version.
178
List or set of direct parent version numbers.
181
Sequence of lines to be added in the new version."""
182
## self._check_versions(parents)
183
## self._check_lines(text)
193
# TODO: It'd probably be faster to append things on to a new
194
# list rather than modifying the existing one, which is likely
195
# to cause a lot of copying.
198
ancestors = self.inclusions(parents)
199
delta = self._delta(ancestors, text)
201
# offset gives the number of lines that have been inserted
202
# into the weave up to the current point; if the original edit instruction
203
# says to change line A then we actually change (A+offset)
206
for i1, i2, newlines in delta:
209
assert i2 <= len(self._l)
211
# the deletion and insertion are handled separately.
212
# first delete the region.
214
self._l.insert(i1+offset, ('[', idx))
215
self._l.insert(i2+offset+1, (']', idx))
220
# there may have been a deletion spanning up to
221
# i2; we want to insert after this region to make sure
222
# we don't destroy ourselves
224
self._l[i:i] = [('{', idx)] \
227
offset += 2 + len(newlines)
229
self._addversion(parents)
231
# special case; adding with no parents revision; can do this
232
# more quickly by just appending unconditionally
233
self._l.append(('{', idx))
235
self._l.append(('}', idx))
237
self._addversion(None)
239
self._sha1s.append(sha1)
244
def inclusions_bitset(self, versions):
251
# if v is included, include all its parents
252
for pv in self._v[v]:
258
def inclusions(self, versions):
259
"""Return set of all ancestors of given version(s)."""
260
from bzrlib.intset import IntSet
267
# include all its parents
272
raise ValueError("version %d not present in weave" % v)
275
def minimal_parents(self, version):
276
"""Find the minimal set of parents for the version."""
277
from bzrlib.intset import IntSet
279
included = self._v[version]
284
li.sort(reverse=True)
292
gotit.update(self.inclusions(pv))
294
assert mininc[0] >= 0
295
assert mininc[-1] < version
299
def _addversion(self, parents):
301
self._v.append(parents)
303
self._v.append(frozenset())
306
def _check_lines(self, text):
307
if not isinstance(text, list):
308
raise ValueError("text should be a list, not %s" % type(text))
311
if not isinstance(l, basestring):
312
raise ValueError("text line should be a string or unicode, not %s"
317
def _check_versions(self, indexes):
318
"""Check everything in the sequence of indexes is valid"""
323
raise IndexError("invalid version number %r" % i)
326
def annotate(self, index):
327
return list(self.annotate_iter(index))
330
def annotate_iter(self, version):
331
"""Yield list of (index-id, line) pairs for the specified version.
333
The index indicates when the line originated in the weave."""
334
for origin, lineno, text in self._extract([version]):
342
(lineno, insert, deletes, text)
343
for each literal line.
349
lineno = 0 # line of weave, 0-based
352
if isinstance(l, tuple):
361
assert not (dset & vs)
368
raise WeaveFormatError('unexpected instruction %r'
371
assert isinstance(l, basestring)
373
yield lineno, istack[-1], dset, l
378
def _extract(self, versions):
379
"""Yield annotation of lines in included set.
381
Yields a sequence of tuples (origin, lineno, text), where
382
origin is the origin version, lineno the index in the weave,
383
and text the text of the line.
385
The set typically but not necessarily corresponds to a version.
387
included = self.inclusions(versions)
392
lineno = 0 # line of weave, 0-based
396
WFE = WeaveFormatError
399
if isinstance(l, tuple):
403
assert v not in istack
418
assert isinstance(l, basestring)
420
isactive = (not dset) and istack and (istack[-1] in included)
422
yield istack[-1], lineno, l
426
raise WFE("unclosed insertion blocks at end of weave",
429
raise WFE("unclosed deletion blocks at end of weave",
433
def get_iter(self, version):
434
"""Yield lines for the specified version."""
435
for origin, lineno, line in self._extract([version]):
439
def get(self, index):
440
return list(self.get_iter(index))
443
def mash_iter(self, included):
444
"""Return composed version of multiple included versions."""
445
included = frozenset(included)
446
for origin, lineno, text in self._extract(included):
450
def dump(self, to_file):
451
from pprint import pprint
452
print >>to_file, "Weave._l = ",
453
pprint(self._l, to_file)
454
print >>to_file, "Weave._v = ",
455
pprint(self._v, to_file)
459
def numversions(self):
461
assert l == len(self._sha1s)
465
def check(self, progress_bar=None):
466
# check no circular inclusions
467
for version in range(self.numversions()):
468
inclusions = list(self._v[version])
471
if inclusions[-1] >= version:
472
raise WeaveFormatError("invalid included version %d for index %d"
473
% (inclusions[-1], version))
475
# try extracting all versions; this is a bit slow and parallel
476
# extraction could be used
478
nv = self.numversions()
479
for version in range(nv):
481
progress_bar.update('checking text', version, nv)
483
for l in self.get_iter(version):
486
expected = self._sha1s[version]
488
raise WeaveError("mismatched sha1 for version %d; "
489
"got %s, expected %s"
490
% (version, hd, expected))
492
# TODO: check insertions are properly nested, that there are
493
# no lines outside of insertion blocks, that deletions are
494
# properly paired, etc.
498
def merge(self, merge_versions):
499
"""Automerge and mark conflicts between versions.
501
This returns a sequence, each entry describing alternatives
502
for a chunk of the file. Each of the alternatives is given as
505
If there is a chunk of the file where there's no diagreement,
506
only one alternative is given.
509
# approach: find the included versions common to all the
511
raise NotImplementedError()
515
def _delta(self, included, lines):
516
"""Return changes from basis to new revision.
518
The old text for comparison is the union of included revisions.
520
This is used in inserting a new text.
522
Delta is returned as a sequence of
523
(weave1, weave2, newlines).
525
This indicates that weave1:weave2 of the old weave should be
526
replaced by the sequence of lines in newlines. Note that
527
these line numbers are positions in the total weave and don't
528
correspond to the lines in any extracted version, or even the
529
extracted union of included versions.
531
If line1=line2, this is a pure insert; if newlines=[] this is a
532
pure delete. (Similar to difflib.)
534
# basis a list of (origin, lineno, line)
537
for origin, lineno, line in self._extract(included):
538
basis_lineno.append(lineno)
539
basis_lines.append(line)
541
# add a sentinal, because we can also match against the final line
542
basis_lineno.append(len(self._l))
544
# XXX: which line of the weave should we really consider
545
# matches the end of the file? the current code says it's the
546
# last line of the weave?
548
from difflib import SequenceMatcher
549
s = SequenceMatcher(None, basis_lines, lines)
551
# TODO: Perhaps return line numbers from composed weave as well?
553
for tag, i1, i2, j1, j2 in s.get_opcodes():
554
##print tag, i1, i2, j1, j2
559
# i1,i2 are given in offsets within basis_lines; we need to map them
560
# back to offsets within the entire weave
561
real_i1 = basis_lineno[i1]
562
real_i2 = basis_lineno[i2]
566
assert j2 <= len(lines)
568
yield real_i1, real_i2, lines[j1:j2]
572
def plan_merge(self, ver_a, ver_b):
573
"""Return pseudo-annotation indicating how the two versions merge.
575
This is computed between versions a and b and their common
578
Weave lines present in none of them are skipped entirely.
580
inc_a = self.inclusions_bitset([ver_a])
581
inc_b = self.inclusions_bitset([ver_b])
582
inc_c = inc_a & inc_b
584
for lineno, insert, deleteset, line in self._walk():
585
insertset = (1L << insert)
586
if deleteset & inc_c:
587
# killed in parent; can't be in either a or b
588
# not relevant to our work
589
yield 'killed-base', line
590
elif insertset & inc_c:
591
# was inserted in base
592
killed_a = bool(deleteset & inc_a)
593
killed_b = bool(deleteset & inc_b)
594
if killed_a and killed_b:
595
yield 'killed-both', line
597
yield 'killed-a', line
599
yield 'killed-b', line
601
yield 'unchanged', line
602
elif insertset & inc_a:
603
if deleteset & inc_a:
604
yield 'ghost-a', line
608
elif insertset & inc_b:
609
if deleteset & inc_b:
610
yield 'ghost-b', line
614
# not in either revision
615
yield 'irrelevant', line
617
yield 'unchanged', '' # terminator
621
def weave_merge(self, plan):
626
for state, line in plan:
627
if state == 'unchanged' or state == 'killed-both':
628
# resync and flush queued conflicts changes if any
629
if not lines_a and not lines_b:
631
elif ch_a and not ch_b:
633
for l in lines_a: yield l
634
elif ch_b and not ch_a:
635
for l in lines_b: yield l
636
elif lines_a == lines_b:
637
for l in lines_a: yield l
640
for l in lines_a: yield l
642
for l in lines_b: yield l
649
if state == 'unchanged':
652
elif state == 'killed-a':
655
elif state == 'killed-b':
658
elif state == 'new-a':
661
elif state == 'new-b':
665
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
675
def weave_info(filename, out):
676
"""Show some text information about the weave."""
677
from weavefile import read_weave
678
wf = file(filename, 'rb')
680
# FIXME: doesn't work on pipes
681
weave_size = wf.tell()
682
print >>out, "weave file size %d bytes" % weave_size
683
print >>out, "weave contains %d versions" % len(w._v)
686
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
687
for i in (6, 6, 8, 40, 20):
690
for i in range(len(w._v)):
693
bytes = sum((len(a) for a in text))
695
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
701
print >>out, "versions total %d bytes" % total
702
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
706
print """bzr weave tool
708
Experimental tool for weave algorithm.
712
Create an empty weave file
713
weave get WEAVEFILE VERSION
714
Write out specified version.
715
weave check WEAVEFILE
716
Check consistency of all versions.
718
Display table of contents.
719
weave add WEAVEFILE [BASE...] < NEWTEXT
720
Add NEWTEXT, with specified parent versions.
721
weave annotate WEAVEFILE VERSION
722
Display origin of each line.
723
weave mash WEAVEFILE VERSION...
724
Display composite of all selected versions.
725
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
726
Auto-merge two versions and display conflicts.
730
% weave init foo.weave
732
% weave add foo.weave < foo.txt
735
(create updated version)
737
% weave get foo.weave 0 | diff -u - foo.txt
738
% weave add foo.weave 0 < foo.txt
741
% weave get foo.weave 0 > foo.txt (create forked version)
743
% weave add foo.weave 0 < foo.txt
746
% weave merge foo.weave 1 2 > foo.txt (merge them)
747
% vi foo.txt (resolve conflicts)
748
% weave add foo.weave 1 2 < foo.txt (commit merged version)
757
from weavefile import write_weave, read_weave
758
from bzrlib.progress import ProgressBar
766
return read_weave(file(argv[2], 'rb'))
772
# at the moment, based on everything in the file
773
parents = map(int, argv[3:])
774
lines = sys.stdin.readlines()
775
ver = w.add(parents, lines)
776
write_weave(w, file(argv[2], 'wb'))
777
print 'added version %d' % ver
780
if os.path.exists(fn):
781
raise IOError("file exists")
783
write_weave(w, file(fn, 'wb'))
784
elif cmd == 'get': # get one version
786
sys.stdout.writelines(w.get_iter(int(argv[3])))
788
elif cmd == 'mash': # get composite
790
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
792
elif cmd == 'annotate':
794
# newline is added to all lines regardless; too hard to get
795
# reasonable formatting otherwise
797
for origin, text in w.annotate(int(argv[3])):
798
text = text.rstrip('\r\n')
800
print ' | %s' % (text)
802
print '%5d | %s' % (origin, text)
806
weave_info(argv[2], sys.stdout)
814
elif cmd == 'inclusions':
816
print ' '.join(map(str, w.inclusions([int(argv[3])])))
818
elif cmd == 'parents':
820
print ' '.join(map(str, w._v[int(argv[3])]))
822
elif cmd == 'plan-merge':
824
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
826
print '%14s | %s' % (state, line),
830
p = w.plan_merge(int(argv[3]), int(argv[4]))
831
sys.stdout.writelines(w.weave_merge(p))
833
elif cmd == 'mash-merge':
839
v1, v2 = map(int, argv[3:5])
841
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
843
base_lines = list(w.mash_iter(basis))
844
a_lines = list(w.get(v1))
845
b_lines = list(w.get(v2))
847
from bzrlib.merge3 import Merge3
848
m3 = Merge3(base_lines, a_lines, b_lines)
850
name_a = 'version %d' % v1
851
name_b = 'version %d' % v2
852
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
854
raise ValueError('unknown command %r' % cmd)
857
if __name__ == '__main__':
859
sys.exit(main(sys.argv))