# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
updates.
"""

from difflib import SequenceMatcher

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
     InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
     RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
     sha_strings
from bzrlib.versionedfile import VersionedFile
from bzrlib.tsort import topo_sort

from StringIO import StringIO
from gzip import GzipFile

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.


INDEX_SUFFIX = '.kndx'
DATA_SUFFIX = '.knit'
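
# An illustrative sketch (not part of the original module): a knit named
# 'foo' is backed by two files on its transport, 'foo' + INDEX_SUFFIX for
# the index and 'foo' + DATA_SUFFIX for the record data.  Assuming a
# transport object is at hand, usage might look like:
#
#   k = KnitVersionedFile(transport, 'foo', 'w', KnitPlainFactory())
#   k.add_lines('rev-1', [], ['hello\n'])
#   k.add_lines('rev-2', ['rev-1'], ['hello\n', 'world\n'])
#   k.get_lines('rev-2')        # -> ['hello\n', 'world\n']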


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta):
        """Apply delta to this content."""
        offset = 0
        for start, end, count, lines in delta:
            self._lines[offset+start:offset+end] = lines
            offset = offset + (start - end) + count

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from new_lines to this content."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = SequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]
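

# Illustration of the delta format consumed by apply_delta (values are
# assumed): a hunk (start, end, count, lines) replaces the old lines
# [start:end) with the count annotated lines in `lines`.  Applying
#
#   [(1, 2, 2, [(1, 'b\n'), (1, 'c\n')])]
#
# to [(0, 'a\n'), (0, 'x\n')] yields
# [(0, 'a\n'), (1, 'b\n'), (1, 'c\n')]; the running offset in
# apply_delta keeps later hunk positions valid after earlier hunks have
# grown or shrunk the text.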


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((int(origin), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            contents = []
            for i in range(c):
                origin, text = lines.pop(0).split(' ', 1)
                contents.append((int(origin), text))
            yield start, end, c, contents

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines))

    def lower_fulltext(self, content):
        return ['%d %s' % (o, t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%d %s' % (origin, text))
        return out
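

# Illustration of the annotated serialization (values are assumed):
# lower_line_delta emits a 'start,end,count' header per hunk followed by
# count lines, each prefixed with its integer origin index, e.g.
#
#   1,2,2
#   3 b
#   3 c
#
# parse_line_delta_iter is the inverse, popping one header and its
# following lines per hunk.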


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            lines = lines[c:]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(transport, relpath, 'w', KnitPlainFactory())


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """

    def __init__(self, transport, relpath, mode, factory,
                 basis_knit=None, delta=True):
        """Construct a knit at location specified by relpath."""
        assert mode in ('r', 'w'), "invalid mode specified"
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            type(basis_knit)

        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory
        self.writable = (mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX, mode)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX, mode)

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text."""
        for parent_id in parents:
            merge_content = self._get_content(parent_id)
            seq = SequenceMatcher(None, merge_content.text(), content.text())
            for i, j, n in seq.get_matching_blocks():
                if n == 0:
                    continue
                content._lines[j:j+n] = merge_content._lines[i:i+n]
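
    # Illustration (assumed values): with parent text ['a\n', 'b\n'] and
    # new content ['a\n', 'b\n', 'c\n'], SequenceMatcher reports the
    # matching block (i=0, j=0, n=2), so the first two lines of the new
    # content inherit the parent's (origin, text) pairs and only 'c\n'
    # keeps the version being added as its origin.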

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        make up the version specified by version_id of the knit.

        The components should be applied in the order of the returned
        list.

        The basis knit will be used to the largest extent possible
        since it is assumed that access to it is faster.
        """
        # needed_versions holds a list of (method, version_id) of
        # versions that need to be fetched to construct the final
        # version of the file.
        #
        # basis_versions is a list of versions that need to be
        # fetched but exist in the basis knit.
        basis = self.basis_knit
        needed_versions = []
        basis_versions = []
        cursor = version_id

        while 1:
            picked_knit = self
            if basis and basis._index.has_version(cursor):
                picked_knit = basis
                basis_versions.append(cursor)
            method = picked_knit._index.get_method(cursor)
            needed_versions.append((method, cursor))
            if method == 'fulltext':
                break
            cursor = picked_knit.get_parents(cursor)[0]

        components = {}
        if basis_versions:
            records = []
            for comp_id in basis_versions:
                data_pos, data_size = basis._index.get_position(comp_id)
                records.append((comp_id, data_pos, data_size))
            components.update(basis._data.read_records(records))

        records = []
        for comp_id in [vid for method, vid in needed_versions
                        if vid not in basis_versions]:
            data_pos, data_size = self._index.get_position(comp_id)
            records.append((comp_id, data_pos, data_size))
        components.update(self._data.read_records(records))

        # read_records returns a mapping with the version id as key and
        # the record data as value.  The order in which the components
        # need to be applied is held by needed_versions (reversed).
        out = []
        for method, comp_id in reversed(needed_versions):
            out.append((comp_id, method, components[comp_id]))

        return out
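
    # Illustration (assumed history): if r1 is stored as a fulltext and
    # r2 and r3 as line-deltas on their parent, _get_components('r3')
    # returns [('r1', 'fulltext', ...), ('r2', 'line-delta', ...),
    # ('r3', 'line-delta', ...)]: one fulltext followed by the deltas to
    # apply, in application order.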

    def _get_content(self, version_id):
        """Return a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        content = None
        components = self._get_components(version_id)
        for component_id, method, (data, digest) in components:
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content.apply_delta(delta)

        if 'no-eol' in self._index.get_options(version_id):
            line = content._lines[-1][1].rstrip('\n')
            content._lines[-1] = (content._lines[-1][0], line)

        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename, 'sha-1 does not match')

        return content

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def add_lines(self, version_id, parents, lines):
        """See VersionedFile.add_lines."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)

        if True or __debug__:
            for l in lines:
                assert '\n' not in l[:-1]

        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta)

    def _add(self, version_id, lines, parents, delta):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.
        """
        if delta and not parents:
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines:
            if lines[-1][-1] != '\n':
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'

        lines = self.factory.make(lines, len(self._index))
        if self.factory.annotated and len(parents) > 0:
            # Merge annotations from parent texts if needed.
            self._merge_annotations(lines, parents)

        if parents and delta:
            # To speed up extraction of texts, the delta chain is
            # limited to a fixed number of deltas.  This should
            # minimize both I/O and the time spent applying deltas.
            count = 0
            delta_parents = parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # No fulltext within the limit; store a fulltext instead
                # of extending the chain further.
                delta = False

        if delta:
            options.append('line-delta')
            content = self._get_content(parents[0])
            delta_hunks = content.line_delta(lines)
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
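
    # Illustration of the delta-chain bound in _add (chain shape is
    # assumed): with r1 stored as a fulltext and r2..r26 each stored as
    # a line-delta on its parent, adding r27 walks 25 parents without
    # finding a fulltext, so r27 is stored as a fulltext itself.  This
    # caps the number of deltas that ever need to be applied to
    # reconstruct a version.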

    def clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self._get_content(version_id).text()

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield self._index.idx_to_name(origin), text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        self._check_versions_present([version_id])
        return list(self._index.get_parents(version_id))

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def _reannotate_line_delta(self, other, lines, new_version_id,
                               new_version_idx):
        """Re-annotate line-delta and return new delta."""
        new_delta = []
        for start, end, count, contents \
                in self.factory.parse_line_delta_iter(lines):
            new_lines = []
            for origin, line in contents:
                old_version_id = other._index.idx_to_name(origin)
                if old_version_id == new_version_id:
                    idx = new_version_idx
                else:
                    idx = self._index.lookup(old_version_id)
                new_lines.append((idx, line))
            new_delta.append((start, end, count, new_lines))

        return self.factory.lower_line_delta(new_delta)

    def _reannotate_fulltext(self, other, lines, new_version_id,
                             new_version_idx):
        """Re-annotate fulltext and return new version."""
        content = self.factory.parse_fulltext(lines, new_version_idx)
        new_lines = []
        for origin, line in content.annotate_iter():
            old_version_id = other._index.idx_to_name(origin)
            if old_version_id == new_version_id:
                idx = new_version_idx
            else:
                idx = self._index.lookup(old_version_id)
            new_lines.append((idx, line))

        return self.factory.lower_fulltext(KnitContent(new_lines))
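
    # Illustration (assumed values): origin indexes are positions in a
    # particular knit's index, so annotations cannot be copied between
    # knits verbatim.  If a line originated in 'rev-1', which is entry 3
    # in `other` but entry 7 here, re-annotation rewrites its origin
    # from 3 to 7, or to new_version_idx when the origin is the version
    # currently being joined, which has no index entry here yet.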

    def join(self, other, pb=None, msg=None, version_ids=None):
        """See VersionedFile.join."""
        assert isinstance(other, KnitVersionedFile)

        if version_ids is None:
            version_ids = other.versions()
        if not version_ids:
            return 0

        if pb is None:
            from bzrlib.progress import DummyProgress
            pb = DummyProgress()

        version_ids = list(version_ids)
        if None in version_ids:
            version_ids.remove(None)

        other_ancestry = set(other.get_ancestry(version_ids))
        needed_versions = other_ancestry - set(self._index.get_versions())
        if not needed_versions:
            return 0
        full_list = topo_sort(other._index.get_graph())

        version_list = [i for i in full_list if (not self.has_version(i)
                        and i in needed_versions)]

        records = []
        for version_id in version_list:
            data_pos, data_size = other._index.get_position(version_id)
            records.append((version_id, data_pos, data_size))

        count = 0
        for version_id, lines, digest \
                in other._data.read_records_iter(records):
            options = other._index.get_options(version_id)
            parents = other._index.get_parents(version_id)

            for parent in parents:
                assert self.has_version(parent)

            if self.factory.annotated:
                # FIXME jrydberg: it should be possible to skip
                # re-annotating components if we know that we are
                # going to pull all revisions in the same order.
                new_version_id = version_id
                new_version_idx = self._index.num_versions()
                if 'fulltext' in options:
                    lines = self._reannotate_fulltext(other, lines,
                        new_version_id, new_version_idx)
                elif 'line-delta' in options:
                    lines = self._reannotate_line_delta(other, lines,
                        new_version_id, new_version_idx)

            count = count + 1
            pb.update(self.filename, count, len(version_list))

            pos, size = self._data.add_record(version_id, digest, lines)
            self._index.add_version(version_id, options, pos, size, parents)

        return count

    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here: extract all relevant texts, put
        # them in a weave and let that do all the work.  Far from
        # optimal, but much simpler.
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line


class _KnitComponentFile(object):
    """One of the files used to implement a knit database."""

    def __init__(self, transport, filename, mode):
        self._transport = transport
        self._filename = filename
        self._mode = mode

    def write_header(self):
        old_len = self._transport.append(self._filename, self.HEADER)
        if old_len != 0:
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.read(len(self.HEADER))
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to an index
    entry.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically
    sorted.
    """

    HEADER = "# bzr knit index 7\n"

    def _cache_version(self, version_id, options, pos, size, parents):
        val = (version_id, options, pos, size, parents)
        self._cache[version_id] = val
        self._history.append(version_id)

    def _iter_index(self, fp):
        lines = fp.read()
        for l in lines.splitlines(False):
            yield l.split()

    def __init__(self, transport, filename, mode):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._cache = {}
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            self.check_header(fp)
            for rec in self._iter_index(fp):
                self._cache_version(rec[0], rec[1].split(','), int(rec[2]),
                    int(rec[3]), [self._history[int(i)] for i in rec[4:]])
        except NoSuchFile, e:
            if mode != 'w':
                raise
            self.write_header()

    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        version_idxs = []
        for version_id in versions:
            version_idxs.append(self._history.index(version_id))
        i = set(versions)
        for v in xrange(max(version_idxs), 0, -1):
            if self._history[v] in i:
                # include all its parents
                i.update(self._cache[self._history[v]][4])
        return i

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        return self._history

    def idx_to_name(self, idx):
        return self._history[idx]

    def lookup(self, version_id):
        assert version_id in self._cache
        return self._history.index(version_id)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self._cache_version(version_id, options, pos, size, parents)

        content = "%s %s %s %s %s\n" % (version_id,
                                        ','.join(options),
                                        pos,
                                        size,
                                        ' '.join([str(self.lookup(vid)) for
                                                  vid in parents]))
        self._transport.append(self._filename, content)

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self._filename)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file."""

    HEADER = "# bzr knit data 7\n"

    def __init__(self, transport, filename, mode):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._file = None
        self._checked = False

    def _open_file(self):
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        print >>data_file, "version %s %d %s" % (version_id, len(lines), digest)
        data_file.writelines(lines)
        print >>data_file, "end %s\n" % version_id
        data_file.close()

        content = sio.getvalue()
        start_pos = self._transport.append(self._filename, content)
        return start_pos, len(content)
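
    # Illustration of the record layout written above (values are
    # assumed): each record is an independently gzipped chunk containing
    #
    #   version <version-id> <line-count> <sha1-digest>
    #   <stored lines...>
    #   end <version-id>
    #
    # so a record can be decompressed and checked on its own given the
    # (pos, size) recorded in the index.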

    def _parse_record(self, version_id, data):
        df = GzipFile(mode='rb', fileobj=StringIO(data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of records')
        if rec[1] != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r' % version_id)
        lines = int(rec[2])
        record_contents = self._read_record_contents(df, lines)
        l = df.readline()
        if l != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        return record_contents, rec[3]

    def _read_record_contents(self, df, record_lines):
        """Read and return record_lines lines from datafile."""
        r = []
        for i in range(record_lines):
            r.append(df.readline())
        return r

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).
        """

        class ContinuousRange:
            def __init__(self, rec_id, pos, size):
                self.start_pos = pos
                self.end_pos = pos + size
                self.versions = [(rec_id, pos, size)]

            def add(self, rec_id, pos, size):
                if self.end_pos != pos:
                    return False
                self.end_pos = pos + size
                self.versions.append((rec_id, pos, size))
                return True

            def split(self, fp):
                for rec_id, pos, size in self.versions:
                    yield rec_id, fp.read(size)

        fp = self._open_file()

        # Loop through all records and try to collect as large a
        # continuous region as possible to read.
        while records:
            record_id, pos, size = records.pop(0)
            continuous_range = ContinuousRange(record_id, pos, size)
            while records:
                record_id, pos, size = records[0]
                if continuous_range.add(record_id, pos, size):
                    del records[0]
                else:
                    break
            fp.seek(continuous_range.start_pos, 0)
            for record_id, data in continuous_range.split(fp):
                content, digest = self._parse_record(record_id, data)
                yield record_id, content, digest
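
    # Illustration (assumed offsets): for requested records at
    # (pos, size) of (0, 100), (100, 50) and (300, 80), the first two
    # are contiguous and fetched with a single seek and one 150-byte
    # read, while the third starts a new ContinuousRange.  Fewer, larger
    # reads keep the transport efficient.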

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components