3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# TODO: Perhaps have copy method for Weave instances?
26
# XXX: If we do weaves this way, will a merge still behave the same
27
# way if it's done in a different order? That's a pretty desirable
30
# TODO: Nothing here so far assumes the lines are really \n newlines,
31
# rather than being split up in some other way. We could accomodate
32
# binaries, perhaps by naively splitting on \n or perhaps using
33
# something like a rolling checksum.
35
# TODO: Track version names as well as indexes.
37
# TODO: End marker for each version so we can stop reading?
39
# TODO: Check that no insertion occurs inside a deletion that was
40
# active in the version of the insertion.
42
# TODO: In addition to the SHA-1 check, perhaps have some code that
43
# checks structural constraints of the weave: ie that insertions are
44
# properly nested, that there is no text outside of an insertion, that
45
# insertions or deletions are not repeated, etc.
47
# TODO: Make the info command just show info, not extract everything:
48
# it can be much faster.
50
# TODO: Perhaps use long integers as sets instead of set objects; may
53
# TODO: Parallel-extract that passes back each line along with a
54
# description of which revisions include it. Nice for checking all
59
from bzrlib.intset import IntSet
63
class WeaveError(Exception):
64
"""Exception in processing weave"""
67
class WeaveFormatError(WeaveError):
68
"""Weave invariant violated"""
72
"""weave - versioned text file storage.
74
A Weave manages versions of line-based text files, keeping track
75
of the originating version for each line.
77
To clients the "lines" of the file are represented as a list of strings.
78
These strings will typically have terminal newline characters, but
79
this is not required. In particular files commonly do not have a newline
80
at the end of the file.
82
Texts can be identified in either of two ways:
84
* a nonnegative index number.
86
* a version-id string.
88
Typically the index number will be valid only inside this weave and
89
the version-id is used to reference it in the larger world.
91
The weave is represented as a list mixing edit instructions and
92
literal text. Each entry in _l can be either a string (or
93
unicode), or a tuple. If a string, it means that the given line
94
should be output in the currently active revisions.
96
If a tuple, it gives a processing instruction saying in which
97
revisions the enclosed lines are active. The tuple has the form
98
(instruction, version).
100
The instruction can be '{' or '}' for an insertion block, and '['
101
and ']' for a deletion block respectively. The version is the
102
integer version index. There is no replace operator, only deletes
107
* A later version can delete lines that were introduced by any
108
number of ancestor versions; this implies that deletion
109
instructions can span insertion blocks without regard to the
110
insertion block's nesting.
112
* Similarly, deletions need not be properly nested with regard to
113
each other, because they might have been generated by
114
independent revisions.
116
* Insertions are always made by inserting a new bracketed block
117
into a single point in the previous weave. This implies they
118
can nest but not overlap, and the nesting must always have later
119
insertions on the inside.
121
* It doesn't seem very useful to have an active insertion
122
inside an inactive insertion, but it might happen.
124
* Therefore, all instructions are always"considered"; that
125
is passed onto and off the stack. An outer inactive block
126
doesn't disable an inner block.
128
* Lines are enabled if the most recent enclosing insertion is
129
active and none of the enclosing deletions are active.
131
* There is no point having a deletion directly inside its own
132
insertion; you might as well just not write it. And there
133
should be no way to get an earlier version deleting a later
140
List of parents, indexed by version number.
141
It is only necessary to store the minimal set of parents for
142
each version; the parent's parents are implied.
145
List of hex SHA-1 of each version, or None if not recorded.
153
def __eq__(self, other):
154
if not isinstance(other, Weave):
156
return self._v == other._v \
157
and self._l == other._l
160
def __ne__(self, other):
161
return not self.__eq__(other)
164
def add(self, parents, text):
165
"""Add a single text on top of the weave.
167
Returns the index number of the newly added version.
170
List or set of direct parent version numbers.
173
Sequence of lines to be added in the new version."""
174
## self._check_versions(parents)
175
## self._check_lines(text)
185
# TODO: It'd probably be faster to append things on to a new
186
# list rather than modifying the existing one, which is likely
187
# to cause a lot of copying.
190
ancestors = self.inclusions(parents)
191
delta = self._delta(ancestors, text)
193
# offset gives the number of lines that have been inserted
194
# into the weave up to the current point; if the original edit instruction
195
# says to change line A then we actually change (A+offset)
198
for i1, i2, newlines in delta:
201
assert i2 <= len(self._l)
203
# the deletion and insertion are handled separately.
204
# first delete the region.
206
self._l.insert(i1+offset, ('[', idx))
207
self._l.insert(i2+offset+1, (']', idx))
212
# there may have been a deletion spanning up to
213
# i2; we want to insert after this region to make sure
214
# we don't destroy ourselves
216
self._l[i:i] = [('{', idx)] \
219
offset += 2 + len(newlines)
221
self._addversion(parents)
223
# special case; adding with no parents revision; can do this
224
# more quickly by just appending unconditionally
225
self._l.append(('{', idx))
227
self._l.append(('}', idx))
229
self._addversion(None)
231
self._sha1s.append(sha1)
236
def inclusions(self, versions):
237
"""Return set of all ancestors of given version(s)."""
243
# include all its parents
248
raise ValueError("version %d not present in weave" % v)
251
def minimal_parents(self, version):
252
"""Find the minimal set of parents for the version."""
253
included = self._v[version]
258
li.sort(reverse=True)
266
gotit.update(self.inclusions(pv))
268
assert mininc[0] >= 0
269
assert mininc[-1] < version
273
def _addversion(self, parents):
275
self._v.append(parents)
277
self._v.append(IntSet())
280
def _check_lines(self, text):
281
if not isinstance(text, list):
282
raise ValueError("text should be a list, not %s" % type(text))
285
if not isinstance(l, basestring):
286
raise ValueError("text line should be a string or unicode, not %s"
291
def _check_versions(self, indexes):
292
"""Check everything in the sequence of indexes is valid"""
297
raise IndexError("invalid version number %r" % i)
300
def annotate(self, index):
301
return list(self.annotate_iter(index))
304
def annotate_iter(self, version):
305
"""Yield list of (index-id, line) pairs for the specified version.
307
The index indicates when the line originated in the weave."""
308
for origin, lineno, text in self._extract([version]):
316
(lineno, insert, deletes, text)
317
for each literal line.
323
lineno = 0 # line of weave, 0-based
326
if isinstance(l, tuple):
339
raise WeaveFormatError('unexpected instruction %r'
342
assert isinstance(l, basestring)
344
yield lineno, istack[-1], dset, l
349
def _extract(self, versions):
350
"""Yield annotation of lines in included set.
352
Yields a sequence of tuples (origin, lineno, text), where
353
origin is the origin version, lineno the index in the weave,
354
and text the text of the line.
356
The set typically but not necessarily corresponds to a version.
358
included = self.inclusions(versions)
363
lineno = 0 # line of weave, 0-based
367
WFE = WeaveFormatError
370
if isinstance(l, tuple):
374
assert v not in istack
389
assert isinstance(l, basestring)
391
isactive = (not dset) and istack and (istack[-1] in included)
393
yield istack[-1], lineno, l
397
raise WFE("unclosed insertion blocks at end of weave",
400
raise WFE("unclosed deletion blocks at end of weave",
404
def get_iter(self, version):
405
"""Yield lines for the specified version."""
406
for origin, lineno, line in self._extract([version]):
410
def get(self, index):
411
return list(self.get_iter(index))
414
def mash_iter(self, included):
415
"""Return composed version of multiple included versions."""
416
for origin, lineno, text in self._extract(included):
420
def dump(self, to_file):
421
from pprint import pprint
422
print >>to_file, "Weave._l = ",
423
pprint(self._l, to_file)
424
print >>to_file, "Weave._v = ",
425
pprint(self._v, to_file)
429
def numversions(self):
431
assert l == len(self._sha1s)
435
def check(self, progress_bar=None):
436
# check no circular inclusions
437
for version in range(self.numversions()):
438
inclusions = list(self._v[version])
441
if inclusions[-1] >= version:
442
raise WeaveFormatError("invalid included version %d for index %d"
443
% (inclusions[-1], version))
445
# try extracting all versions; this is a bit slow and parallel
446
# extraction could be used
448
nv = self.numversions()
449
for version in range(nv):
451
progress_bar.update('checking text', version, nv)
453
for l in self.get_iter(version):
456
expected = self._sha1s[version]
458
raise WeaveError("mismatched sha1 for version %d; "
459
"got %s, expected %s"
460
% (version, hd, expected))
462
# TODO: check insertions are properly nested, that there are
463
# no lines outside of insertion blocks, that deletions are
464
# properly paired, etc.
468
def merge(self, merge_versions):
469
"""Automerge and mark conflicts between versions.
471
This returns a sequence, each entry describing alternatives
472
for a chunk of the file. Each of the alternatives is given as
475
If there is a chunk of the file where there's no diagreement,
476
only one alternative is given.
479
# approach: find the included versions common to all the
481
raise NotImplementedError()
485
def _delta(self, included, lines):
486
"""Return changes from basis to new revision.
488
The old text for comparison is the union of included revisions.
490
This is used in inserting a new text.
492
Delta is returned as a sequence of
493
(weave1, weave2, newlines).
495
This indicates that weave1:weave2 of the old weave should be
496
replaced by the sequence of lines in newlines. Note that
497
these line numbers are positions in the total weave and don't
498
correspond to the lines in any extracted version, or even the
499
extracted union of included versions.
501
If line1=line2, this is a pure insert; if newlines=[] this is a
502
pure delete. (Similar to difflib.)
504
# basis a list of (origin, lineno, line)
507
for origin, lineno, line in self._extract(included):
508
basis_lineno.append(lineno)
509
basis_lines.append(line)
511
# add a sentinal, because we can also match against the final line
512
basis_lineno.append(len(self._l))
514
# XXX: which line of the weave should we really consider
515
# matches the end of the file? the current code says it's the
516
# last line of the weave?
518
from difflib import SequenceMatcher
519
s = SequenceMatcher(None, basis_lines, lines)
521
# TODO: Perhaps return line numbers from composed weave as well?
523
for tag, i1, i2, j1, j2 in s.get_opcodes():
524
##print tag, i1, i2, j1, j2
529
# i1,i2 are given in offsets within basis_lines; we need to map them
530
# back to offsets within the entire weave
531
real_i1 = basis_lineno[i1]
532
real_i2 = basis_lineno[i2]
536
assert j2 <= len(lines)
538
yield real_i1, real_i2, lines[j1:j2]
542
def plan_merge(self, ver_a, ver_b):
543
"""Return pseudo-annotation indicating how the two versions merge.
545
This is computed between versions a and b and their common
548
Weave lines present in none of them are skipped entirely.
550
inc_a = self.inclusions([ver_a])
551
inc_b = self.inclusions([ver_b])
552
inc_c = inc_a & inc_b
554
for lineno, insert, deleteset, line in self._walk():
555
if deleteset & inc_c:
556
# killed in parent; can't be in either a or b
557
# not relevant to our work
558
yield 'killed-base', line
559
elif insert in inc_c:
560
# was inserted in base
561
killed_a = bool(deleteset & inc_a)
562
killed_b = bool(deleteset & inc_b)
563
if killed_a and killed_b:
564
yield 'killed-both', line
566
yield 'killed-a', line
568
yield 'killed-b', line
570
yield 'unchanged', line
571
elif insert in inc_a:
572
if deleteset & inc_a:
573
yield 'ghost-a', line
577
elif insert in inc_b:
578
if deleteset & inc_b:
579
yield 'ghost-b', line
583
# not in either revision
584
yield 'irrelevant', line
586
yield 'unchanged', '' # terminator
590
def weave_merge(self, plan):
595
for state, line in plan:
596
if state == 'unchanged' or state == 'killed-both':
597
# resync and flush queued conflicts changes if any
598
if not lines_a and not lines_b:
600
elif ch_a and not ch_b:
602
for l in lines_a: yield l
603
elif ch_b and not ch_a:
604
for l in lines_b: yield l
605
elif lines_a == lines_b:
606
for l in lines_a: yield l
609
for l in lines_a: yield l
611
for l in lines_b: yield l
618
if state == 'unchanged':
621
elif state == 'killed-a':
624
elif state == 'killed-b':
627
elif state == 'new-a':
630
elif state == 'new-b':
634
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
644
def weave_info(filename, out):
645
"""Show some text information about the weave."""
646
from weavefile import read_weave
647
wf = file(filename, 'rb')
649
# FIXME: doesn't work on pipes
650
weave_size = wf.tell()
651
print >>out, "weave file size %d bytes" % weave_size
652
print >>out, "weave contains %d versions" % len(w._v)
655
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
656
for i in (6, 6, 8, 40, 20):
659
for i in range(len(w._v)):
662
bytes = sum((len(a) for a in text))
664
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
670
print >>out, "versions total %d bytes" % total
671
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
675
print """bzr weave tool
677
Experimental tool for weave algorithm.
681
Create an empty weave file
682
weave get WEAVEFILE VERSION
683
Write out specified version.
684
weave check WEAVEFILE
685
Check consistency of all versions.
687
Display table of contents.
688
weave add WEAVEFILE [BASE...] < NEWTEXT
689
Add NEWTEXT, with specified parent versions.
690
weave annotate WEAVEFILE VERSION
691
Display origin of each line.
692
weave mash WEAVEFILE VERSION...
693
Display composite of all selected versions.
694
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
695
Auto-merge two versions and display conflicts.
699
% weave init foo.weave
701
% weave add foo.weave < foo.txt
704
(create updated version)
706
% weave get foo.weave 0 | diff -u - foo.txt
707
% weave add foo.weave 0 < foo.txt
710
% weave get foo.weave 0 > foo.txt (create forked version)
712
% weave add foo.weave 0 < foo.txt
715
% weave merge foo.weave 1 2 > foo.txt (merge them)
716
% vi foo.txt (resolve conflicts)
717
% weave add foo.weave 1 2 < foo.txt (commit merged version)
726
from weavefile import write_weave, read_weave
727
from bzrlib.progress import ProgressBar
735
return read_weave(file(argv[2], 'rb'))
741
# at the moment, based on everything in the file
742
parents = map(int, argv[3:])
743
lines = sys.stdin.readlines()
744
ver = w.add(parents, lines)
745
write_weave(w, file(argv[2], 'wb'))
746
print 'added version %d' % ver
749
if os.path.exists(fn):
750
raise IOError("file exists")
752
write_weave(w, file(fn, 'wb'))
753
elif cmd == 'get': # get one version
755
sys.stdout.writelines(w.get_iter(int(argv[3])))
757
elif cmd == 'mash': # get composite
759
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
761
elif cmd == 'annotate':
763
# newline is added to all lines regardless; too hard to get
764
# reasonable formatting otherwise
766
for origin, text in w.annotate(int(argv[3])):
767
text = text.rstrip('\r\n')
769
print ' | %s' % (text)
771
print '%5d | %s' % (origin, text)
775
weave_info(argv[2], sys.stdout)
783
elif cmd == 'inclusions':
785
print ' '.join(map(str, w.inclusions([int(argv[3])])))
787
elif cmd == 'parents':
789
print ' '.join(map(str, w._v[int(argv[3])]))
791
elif cmd == 'plan-merge':
793
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
795
print '%14s | %s' % (state, line),
799
p = w.plan_merge(int(argv[3]), int(argv[4]))
800
sys.stdout.writelines(w.weave_merge(p))
802
elif cmd == 'mash-merge':
808
v1, v2 = map(int, argv[3:5])
810
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
812
base_lines = list(w.mash_iter(basis))
813
a_lines = list(w.get(v1))
814
b_lines = list(w.get(v2))
816
from bzrlib.merge3 import Merge3
817
m3 = Merge3(base_lines, a_lines, b_lines)
819
name_a = 'version %d' % v1
820
name_b = 'version %d' % v2
821
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
823
raise ValueError('unknown command %r' % cmd)
826
if __name__ == '__main__':
828
sys.exit(main(sys.argv))