3
# Copyright (C) 2005 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
# Author: Martin Pool <mbp@canonical.com>
22
"""Weave - storage of related text file versions"""
24
# before intset (r923) 2000 versions in 41.5s
25
# with intset (r926) 2000 versions in 93s !!!
26
# better to just use plain sets.
28
# TODO: Perhaps have copy method for Weave instances?
30
# XXX: If we do weaves this way, will a merge still behave the same
31
# way if it's done in a different order? That's a pretty desirable
34
# TODO: Nothing here so far assumes the lines are really \n newlines,
35
# rather than being split up in some other way. We could accomodate
36
# binaries, perhaps by naively splitting on \n or perhaps using
37
# something like a rolling checksum.
39
# TODO: Track version names as well as indexes.
41
# TODO: End marker for each version so we can stop reading?
43
# TODO: Check that no insertion occurs inside a deletion that was
44
# active in the version of the insertion.
46
# TODO: In addition to the SHA-1 check, perhaps have some code that
47
# checks structural constraints of the weave: ie that insertions are
48
# properly nested, that there is no text outside of an insertion, that
49
# insertions or deletions are not repeated, etc.
51
# TODO: Make the info command just show info, not extract everything:
52
# it can be much faster.
54
# TODO: Perhaps use long integers as sets instead of set objects; may
57
# TODO: Parallel-extract that passes back each line along with a
58
# description of which revisions include it. Nice for checking all
64
class WeaveError(Exception):
65
"""Exception in processing weave"""
68
class WeaveFormatError(WeaveError):
69
"""Weave invariant violated"""
73
"""weave - versioned text file storage.
75
A Weave manages versions of line-based text files, keeping track
76
of the originating version for each line.
78
To clients the "lines" of the file are represented as a list of strings.
79
These strings will typically have terminal newline characters, but
80
this is not required. In particular files commonly do not have a newline
81
at the end of the file.
83
Texts can be identified in either of two ways:
85
* a nonnegative index number.
87
* a version-id string.
89
Typically the index number will be valid only inside this weave and
90
the version-id is used to reference it in the larger world.
92
The weave is represented as a list mixing edit instructions and
93
literal text. Each entry in _l can be either a string (or
94
unicode), or a tuple. If a string, it means that the given line
95
should be output in the currently active revisions.
97
If a tuple, it gives a processing instruction saying in which
98
revisions the enclosed lines are active. The tuple has the form
99
(instruction, version).
101
The instruction can be '{' or '}' for an insertion block, and '['
102
and ']' for a deletion block respectively. The version is the
103
integer version index. There is no replace operator, only deletes
108
* A later version can delete lines that were introduced by any
109
number of ancestor versions; this implies that deletion
110
instructions can span insertion blocks without regard to the
111
insertion block's nesting.
113
* Similarly, deletions need not be properly nested with regard to
114
each other, because they might have been generated by
115
independent revisions.
117
* Insertions are always made by inserting a new bracketed block
118
into a single point in the previous weave. This implies they
119
can nest but not overlap, and the nesting must always have later
120
insertions on the inside.
122
* It doesn't seem very useful to have an active insertion
123
inside an inactive insertion, but it might happen.
125
* Therefore, all instructions are always"considered"; that
126
is passed onto and off the stack. An outer inactive block
127
doesn't disable an inner block.
129
* Lines are enabled if the most recent enclosing insertion is
130
active and none of the enclosing deletions are active.
132
* There is no point having a deletion directly inside its own
133
insertion; you might as well just not write it. And there
134
should be no way to get an earlier version deleting a later
141
List of parents, indexed by version number.
142
It is only necessary to store the minimal set of parents for
143
each version; the parent's parents are implied.
146
List of hex SHA-1 of each version, or None if not recorded.
154
def __eq__(self, other):
155
if not isinstance(other, Weave):
157
return self._v == other._v \
158
and self._l == other._l
161
def __ne__(self, other):
162
return not self.__eq__(other)
165
def add(self, parents, text):
166
"""Add a single text on top of the weave.
168
Returns the index number of the newly added version.
171
List or set of direct parent version numbers.
174
Sequence of lines to be added in the new version."""
175
## self._check_versions(parents)
176
## self._check_lines(text)
186
# TODO: It'd probably be faster to append things on to a new
187
# list rather than modifying the existing one, which is likely
188
# to cause a lot of copying.
191
ancestors = self.inclusions(parents)
192
delta = self._delta(ancestors, text)
194
# offset gives the number of lines that have been inserted
195
# into the weave up to the current point; if the original edit instruction
196
# says to change line A then we actually change (A+offset)
199
for i1, i2, newlines in delta:
202
assert i2 <= len(self._l)
204
# the deletion and insertion are handled separately.
205
# first delete the region.
207
self._l.insert(i1+offset, ('[', idx))
208
self._l.insert(i2+offset+1, (']', idx))
213
# there may have been a deletion spanning up to
214
# i2; we want to insert after this region to make sure
215
# we don't destroy ourselves
217
self._l[i:i] = [('{', idx)] \
220
offset += 2 + len(newlines)
222
self._addversion(parents)
224
# special case; adding with no parents revision; can do this
225
# more quickly by just appending unconditionally
226
self._l.append(('{', idx))
228
self._l.append(('}', idx))
230
self._addversion(None)
232
self._sha1s.append(sha1)
237
def inclusions(self, versions):
238
"""Return set of all ancestors of given version(s)."""
244
# include all its parents
249
raise ValueError("version %d not present in weave" % v)
252
def minimal_parents(self, version):
253
"""Find the minimal set of parents for the version."""
254
included = self._v[version]
259
li.sort(reverse=True)
267
gotit.update(self.inclusions(pv))
269
assert mininc[0] >= 0
270
assert mininc[-1] < version
274
def _addversion(self, parents):
276
self._v.append(parents)
278
self._v.append(set())
281
def _check_lines(self, text):
282
if not isinstance(text, list):
283
raise ValueError("text should be a list, not %s" % type(text))
286
if not isinstance(l, basestring):
287
raise ValueError("text line should be a string or unicode, not %s"
292
def _check_versions(self, indexes):
293
"""Check everything in the sequence of indexes is valid"""
298
raise IndexError("invalid version number %r" % i)
301
def annotate(self, index):
302
return list(self.annotate_iter(index))
305
def annotate_iter(self, version):
306
"""Yield list of (index-id, line) pairs for the specified version.
308
The index indicates when the line originated in the weave."""
309
for origin, lineno, text in self._extract([version]):
317
(lineno, insert, deletes, text)
318
for each literal line.
324
lineno = 0 # line of weave, 0-based
327
if isinstance(l, tuple):
340
raise WeaveFormatError('unexpected instruction %r'
343
assert isinstance(l, basestring)
345
yield lineno, istack[-1], dset, l
350
def _extract(self, versions):
351
"""Yield annotation of lines in included set.
353
Yields a sequence of tuples (origin, lineno, text), where
354
origin is the origin version, lineno the index in the weave,
355
and text the text of the line.
357
The set typically but not necessarily corresponds to a version.
359
included = self.inclusions(versions)
364
lineno = 0 # line of weave, 0-based
368
WFE = WeaveFormatError
371
if isinstance(l, tuple):
375
assert v not in istack
390
assert isinstance(l, basestring)
392
isactive = (not dset) and istack and (istack[-1] in included)
394
yield istack[-1], lineno, l
398
raise WFE("unclosed insertion blocks at end of weave",
401
raise WFE("unclosed deletion blocks at end of weave",
405
def get_iter(self, version):
406
"""Yield lines for the specified version."""
407
for origin, lineno, line in self._extract([version]):
411
def get(self, index):
412
return list(self.get_iter(index))
415
def mash_iter(self, included):
416
"""Return composed version of multiple included versions."""
417
for origin, lineno, text in self._extract(included):
421
def dump(self, to_file):
422
from pprint import pprint
423
print >>to_file, "Weave._l = ",
424
pprint(self._l, to_file)
425
print >>to_file, "Weave._v = ",
426
pprint(self._v, to_file)
430
def numversions(self):
432
assert l == len(self._sha1s)
436
def check(self, progress_bar=None):
437
# check no circular inclusions
438
for version in range(self.numversions()):
439
inclusions = list(self._v[version])
442
if inclusions[-1] >= version:
443
raise WeaveFormatError("invalid included version %d for index %d"
444
% (inclusions[-1], version))
446
# try extracting all versions; this is a bit slow and parallel
447
# extraction could be used
449
nv = self.numversions()
450
for version in range(nv):
452
progress_bar.update('checking text', version, nv)
454
for l in self.get_iter(version):
457
expected = self._sha1s[version]
459
raise WeaveError("mismatched sha1 for version %d; "
460
"got %s, expected %s"
461
% (version, hd, expected))
463
# TODO: check insertions are properly nested, that there are
464
# no lines outside of insertion blocks, that deletions are
465
# properly paired, etc.
469
def merge(self, merge_versions):
470
"""Automerge and mark conflicts between versions.
472
This returns a sequence, each entry describing alternatives
473
for a chunk of the file. Each of the alternatives is given as
476
If there is a chunk of the file where there's no diagreement,
477
only one alternative is given.
480
# approach: find the included versions common to all the
482
raise NotImplementedError()
486
def _delta(self, included, lines):
487
"""Return changes from basis to new revision.
489
The old text for comparison is the union of included revisions.
491
This is used in inserting a new text.
493
Delta is returned as a sequence of
494
(weave1, weave2, newlines).
496
This indicates that weave1:weave2 of the old weave should be
497
replaced by the sequence of lines in newlines. Note that
498
these line numbers are positions in the total weave and don't
499
correspond to the lines in any extracted version, or even the
500
extracted union of included versions.
502
If line1=line2, this is a pure insert; if newlines=[] this is a
503
pure delete. (Similar to difflib.)
505
# basis a list of (origin, lineno, line)
508
for origin, lineno, line in self._extract(included):
509
basis_lineno.append(lineno)
510
basis_lines.append(line)
512
# add a sentinal, because we can also match against the final line
513
basis_lineno.append(len(self._l))
515
# XXX: which line of the weave should we really consider
516
# matches the end of the file? the current code says it's the
517
# last line of the weave?
519
from difflib import SequenceMatcher
520
s = SequenceMatcher(None, basis_lines, lines)
522
# TODO: Perhaps return line numbers from composed weave as well?
524
for tag, i1, i2, j1, j2 in s.get_opcodes():
525
##print tag, i1, i2, j1, j2
530
# i1,i2 are given in offsets within basis_lines; we need to map them
531
# back to offsets within the entire weave
532
real_i1 = basis_lineno[i1]
533
real_i2 = basis_lineno[i2]
537
assert j2 <= len(lines)
539
yield real_i1, real_i2, lines[j1:j2]
543
def plan_merge(self, ver_a, ver_b):
544
"""Return pseudo-annotation indicating how the two versions merge.
546
This is computed between versions a and b and their common
549
Weave lines present in none of them are skipped entirely.
551
inc_a = self.inclusions([ver_a])
552
inc_b = self.inclusions([ver_b])
553
inc_c = inc_a & inc_b
555
for lineno, insert, deleteset, line in self._walk():
556
if deleteset & inc_c:
557
# killed in parent; can't be in either a or b
558
# not relevant to our work
559
yield 'killed-base', line
560
elif insert in inc_c:
561
# was inserted in base
562
killed_a = bool(deleteset & inc_a)
563
killed_b = bool(deleteset & inc_b)
564
if killed_a and killed_b:
565
yield 'killed-both', line
567
yield 'killed-a', line
569
yield 'killed-b', line
571
yield 'unchanged', line
572
elif insert in inc_a:
573
if deleteset & inc_a:
574
yield 'ghost-a', line
578
elif insert in inc_b:
579
if deleteset & inc_b:
580
yield 'ghost-b', line
584
# not in either revision
585
yield 'irrelevant', line
587
yield 'unchanged', '' # terminator
591
def weave_merge(self, plan):
596
for state, line in plan:
597
if state == 'unchanged' or state == 'killed-both':
598
# resync and flush queued conflicts changes if any
599
if not lines_a and not lines_b:
601
elif ch_a and not ch_b:
603
for l in lines_a: yield l
604
elif ch_b and not ch_a:
605
for l in lines_b: yield l
606
elif lines_a == lines_b:
607
for l in lines_a: yield l
610
for l in lines_a: yield l
612
for l in lines_b: yield l
619
if state == 'unchanged':
622
elif state == 'killed-a':
625
elif state == 'killed-b':
628
elif state == 'new-a':
631
elif state == 'new-b':
635
assert state in ('irrelevant', 'ghost-a', 'ghost-b', 'killed-base',
645
def weave_info(filename, out):
646
"""Show some text information about the weave."""
647
from weavefile import read_weave
648
wf = file(filename, 'rb')
650
# FIXME: doesn't work on pipes
651
weave_size = wf.tell()
652
print >>out, "weave file size %d bytes" % weave_size
653
print >>out, "weave contains %d versions" % len(w._v)
656
print '%6s %6s %8s %40s %20s' % ('ver', 'lines', 'bytes', 'sha1', 'parents')
657
for i in (6, 6, 8, 40, 20):
660
for i in range(len(w._v)):
663
bytes = sum((len(a) for a in text))
665
print '%6d %6d %8d %40s' % (i, lines, bytes, sha1),
671
print >>out, "versions total %d bytes" % total
672
print >>out, "compression ratio %.3f" % (float(total)/float(weave_size))
676
print """bzr weave tool
678
Experimental tool for weave algorithm.
682
Create an empty weave file
683
weave get WEAVEFILE VERSION
684
Write out specified version.
685
weave check WEAVEFILE
686
Check consistency of all versions.
688
Display table of contents.
689
weave add WEAVEFILE [BASE...] < NEWTEXT
690
Add NEWTEXT, with specified parent versions.
691
weave annotate WEAVEFILE VERSION
692
Display origin of each line.
693
weave mash WEAVEFILE VERSION...
694
Display composite of all selected versions.
695
weave merge WEAVEFILE VERSION1 VERSION2 > OUT
696
Auto-merge two versions and display conflicts.
700
% weave init foo.weave
702
% weave add foo.weave < foo.txt
705
(create updated version)
707
% weave get foo.weave 0 | diff -u - foo.txt
708
% weave add foo.weave 0 < foo.txt
711
% weave get foo.weave 0 > foo.txt (create forked version)
713
% weave add foo.weave 0 < foo.txt
716
% weave merge foo.weave 1 2 > foo.txt (merge them)
717
% vi foo.txt (resolve conflicts)
718
% weave add foo.weave 1 2 < foo.txt (commit merged version)
727
from weavefile import write_weave, read_weave
728
from bzrlib.progress import ProgressBar
736
return read_weave(file(argv[2], 'rb'))
742
# at the moment, based on everything in the file
743
parents = map(int, argv[3:])
744
lines = sys.stdin.readlines()
745
ver = w.add(parents, lines)
746
write_weave(w, file(argv[2], 'wb'))
747
print 'added version %d' % ver
750
if os.path.exists(fn):
751
raise IOError("file exists")
753
write_weave(w, file(fn, 'wb'))
754
elif cmd == 'get': # get one version
756
sys.stdout.writelines(w.get_iter(int(argv[3])))
758
elif cmd == 'mash': # get composite
760
sys.stdout.writelines(w.mash_iter(map(int, argv[3:])))
762
elif cmd == 'annotate':
764
# newline is added to all lines regardless; too hard to get
765
# reasonable formatting otherwise
767
for origin, text in w.annotate(int(argv[3])):
768
text = text.rstrip('\r\n')
770
print ' | %s' % (text)
772
print '%5d | %s' % (origin, text)
776
weave_info(argv[2], sys.stdout)
784
elif cmd == 'inclusions':
786
print ' '.join(map(str, w.inclusions([int(argv[3])])))
788
elif cmd == 'parents':
790
print ' '.join(map(str, w._v[int(argv[3])]))
792
elif cmd == 'plan-merge':
794
for state, line in w.plan_merge(int(argv[3]), int(argv[4])):
796
print '%14s | %s' % (state, line),
800
p = w.plan_merge(int(argv[3]), int(argv[4]))
801
sys.stdout.writelines(w.weave_merge(p))
803
elif cmd == 'mash-merge':
809
v1, v2 = map(int, argv[3:5])
811
basis = w.inclusions([v1]).intersection(w.inclusions([v2]))
813
base_lines = list(w.mash_iter(basis))
814
a_lines = list(w.get(v1))
815
b_lines = list(w.get(v2))
817
from bzrlib.merge3 import Merge3
818
m3 = Merge3(base_lines, a_lines, b_lines)
820
name_a = 'version %d' % v1
821
name_b = 'version %d' % v2
822
sys.stdout.writelines(m3.merge_lines(name_a=name_a, name_b=name_b))
824
raise ValueError('unknown command %r' % cmd)
827
if __name__ == '__main__':
829
sys.exit(main(sys.argv))