# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
writes.

lifeless: the data file is made up of "delta records".  each delta record has a delta header
that contains: (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent).  the delta also ends with an
end-marker: simply "end VERSION"

delta can be line or full contents.
... the 8s in the example below are the index numbers of the annotations.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8         e.set('executable', 'yes')
8         if elt.get('executable') == 'yes':
8             ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation.
"""

# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave'
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?

from copy import copy
from cStringIO import StringIO
import difflib
import errno
import gzip
from itertools import izip, chain
import zlib

import bzrlib
import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
     sha_strings
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
import bzrlib.ui

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta):
        """Apply delta to this content."""
        offset = 0
        for start, end, count, lines in delta:
            self._lines[offset+start:offset+end] = lines
            offset = offset + (start - end) + count
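
    # Illustrative sketch (not from the original source): a delta hunk
    # (start, end, count, lines) replaces self._lines[start:end] with lines.
    # Applying [(1, 2, 1, [('rev-b', 'B\n')])] to a content whose lines are
    # [('rev-a', 'a\n'), ('rev-a', 'b\n')] leaves
    # [('rev-a', 'a\n'), ('rev-b', 'B\n')].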

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = SequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            # yield hunks of the form (start, end, new line count, new lines)
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            contents = []
            for i in range(c):
                origin, text = lines.pop(0).split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            yield start, end, c, contents
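
    # Illustrative sketch (not from the original source): a serialized hunk
    # that replaces line 1 with one new annotated line could look like
    #   "1,2,1\n" followed by "rev-b new text\n",
    # which parse_line_delta_iter yields as (1, 2, 1, [(u'rev-b', 'new text\n')]).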

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines))

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta_iter which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            lines = lines[c:]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory())


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
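
    A minimal usage sketch (illustrative only, assuming a bzrlib transport
    object ``t``)::

        k = KnitVersionedFile('foo', t, create=True)
        k.add_lines('rev-1', [], ['hello\n'])
        k.add_lines('rev-2', ['rev-1'], ['hello\n', 'world\n'])
        assert k.get_lines('rev-2') == ['hello\n', 'world\n']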
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None, factory=None,
                 basis_knit=None, delta=True, create=False):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        """
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            type(basis_knit)

        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=not len(self.versions()))

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes:
        transport.put(name + INDEX_SUFFIX + '.tmp', self.transport.get(self._index._filename))
        # copy the data file:
        transport.put(name + DATA_SUFFIX, self._data._open_file())
        # rename the copied index into place
        transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory, delta=self.delta, create=True)

    def _fix_parents(self, version, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        the parents list must be a superset of the current
        list.
        """
        current_values = self._index._cache[version]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        # maybe we have it
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        diff_hunks = []
        reference_content = self._get_content(parents[0], parent_texts)
        new_texts = [text for origin, text in content._lines]
        old_texts = [text for origin, text in reference_content._lines]
        delta_seq = SequenceMatcher(None, old_texts, new_texts)
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], content._lines[op[3]:op[4]]))
        if annotated:
            # reuse the delta_sequencer if possible:
            for i, j, n in delta_seq.get_matching_blocks():
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[j:j+n] = reference_content._lines[i:i+n]
            for parent_id in parents[1:]:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = SequenceMatcher(None, merge_content.text(), content.text())
                for i, j, n in seq.get_matching_blocks():
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            return diff_hunks

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        makes up version specified by version_id of the knit.

        The components should be applied in the order of the returned
        list.

        The basis knit will be used to the largest extent possible
        since it is assumed that accesses to it are faster.
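
        For example (illustrative only): a version stored as two line-deltas
        on top of a fulltext comes back roughly as
        [(v1, 'fulltext', data1), (v2, 'line-delta', data2), (v3, 'line-delta', data3)],
        to be applied in that order.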
        """
        # needed_revisions holds a list of (method, version_id) of
        # versions that need to be fetched to construct the final
        # version of the file.
        #
        # basis_revisions is a list of versions that need to be
        # fetched but exist in the basis knit.
        basis = self.basis_knit
        needed_versions = []
        basis_versions = []
        cursor = version_id
        while 1:
            picked_knit = self
            if basis and basis._index.has_version(cursor):
                picked_knit = basis
                basis_versions.append(cursor)
            method = picked_knit._index.get_method(cursor)
            needed_versions.append((method, cursor))
            if method == 'fulltext':
                break
            cursor = picked_knit.get_parents(cursor)[0]

        components = {}
        if basis_versions:
            records = []
            for comp_id in basis_versions:
                data_pos, data_size = basis._index.get_data_position(comp_id)
                records.append((comp_id, data_pos, data_size))
            components.update(basis._data.read_records(records))

        records = []
        for comp_id in [vid for method, vid in needed_versions
                        if vid not in basis_versions]:
            data_pos, data_size = self._index.get_position(comp_id)
            records.append((comp_id, data_pos, data_size))
        components.update(self._data.read_records(records))

        # get_data_records returns a mapping with the version id as
        # index and the value as data.  The order the components need
        # to be applied is held by needed_versions (reversed).
        out = []
        for method, comp_id in reversed(needed_versions):
            out.append((comp_id, method, components[comp_id]))

        return out

    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        content = None
        components = self._get_components(version_id)
        for component_id, method, (data, digest) in components:
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content.apply_delta(delta)

        if 'no-eol' in self._index.get_options(version_id):
            line = content._lines[-1][1].rstrip('\n')
            content._lines[-1] = (content._lines[-1][0], line)

        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename, 'sha-1 does not match')

        return content

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        if False or __debug__:
            for l in lines:
                assert '\n' not in l[:-1]

    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # 461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383   0      8.0370      8.0370   +<len>
        # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)
        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines[-1][-1] != '\n':
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spend applying deltas.
            count = 0
            delta_parents = present_parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not hit a fulltext within the chain limit, so
                # store a fulltext instead of another delta.
                delta = False

        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if so is needed.
            delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
                                                  delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
        return digest

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self._get_content(version_id).text()

    def iter_lines_added_or_present_in_versions(self, version_ids=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = list(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        version_ids = []
        for version_id in self.versions():
            if version_id in requested_versions:
                version_ids.append(version_id)
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        count = 0
        total = len(version_id_records)
        try:
            pb.update('Walking content.', count, total)
            for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
                pb.update('Walking content.', count, total)
                method = self._index.get_method(version_id)
                version_idx = self._index.lookup(version_id)
                assert method in ('fulltext', 'line-delta')
                if method == 'fulltext':
                    content = self.factory.parse_fulltext(data, version_idx)
                    for line in content.text():
                        yield line
                else:
                    delta = self.factory.parse_line_delta(data, version_idx)
                    for start, end, count, lines in delta:
                        for origin, line in lines:
                            yield line
            pb.update('Walking content.', total, total)
            pb.finished()
        except:
            pb.update('Walking content.', total, total)
            pb.finished()
            raise

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        self._check_versions_present([version_id])
        return list(self._index.get_parents(version_id))

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents_with_ghosts."""
        self._check_versions_present([version_id])
        return list(self._index.get_parents_with_ghosts(version_id))

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)

    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode):
        self._transport = transport
        self._filename = filename
        self._mode = mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER)):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.read(len(self.HEADER))
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to a Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.
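
    For illustration only (not a verbatim example from the original), an
    index line roughly has the shape "version-id options pos size parent-refs",
    where each parent reference is either the integer index of an earlier
    entry or '.version-id' for a ghost, e.g.
    "rev-2 line-delta 1024 88 0 .ghost-rev".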

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.
    """

    HEADER = "# bzr knit index 7\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
        indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            self._history.append(version_id)
        self._cache[version_id] = (version_id, options, pos, size, parents)

    def __init__(self, transport, filename, mode, create=False):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once).
                # probably what we want for nice behaviour on
                # http is an incremental readlines that yields, or
                # a check for local vs non local indexes,
                for l in fp.readlines():
                    rec = l.split()
                    #pb.update('read knit index', count, total)
                    # See self._parse_parents
                    parents = []
                    for value in rec[4:]:
                        if value[0] == '.':
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(self._history[int(value)])
                    # end self._parse_parents
                    # self._cache_version(rec[0],
                    #                     rec[1].split(','),
                    #                     int(rec[2]),
                    #                     int(rec[3]),
                    #                     parents)
                    # --- self._cache_version
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = rec[0]
                    if version_id not in self._cache:
                        self._history.append(version_id)
                    self._cache[version_id] = (version_id,
                                               rec[1].split(','),
                                               int(rec[2]),
                                               int(rec[3]),
                                               parents)
                    # --- self._cache_version
            except NoSuchFile, e:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok; trim ghosts out of the parent list:
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())
897
def get_ancestry_with_ghosts(self, versions):
898
"""See VersionedFile.get_ancestry_with_ghosts."""
899
# get a graph of all the mentioned versions:
901
pending = set(versions)
903
version = pending.pop()
905
parents = self._cache[version][4]
912
for parent in parents:
913
if parent not in graph:
915
graph[version] = parents
916
return topo_sort(graph.items())
918
def num_versions(self):
919
return len(self._history)
921
__len__ = num_versions
923
def get_versions(self):
926
def idx_to_name(self, idx):
927
return self._history[idx]
929
def lookup(self, version_id):
930
assert version_id in self._cache
931
return self._history.index(version_id)
933
def _version_list_to_index(self, versions):
935
for version in versions:
936
if version in self._cache:
937
result_list.append(str(self._history.index(version)))
939
result_list.append('.' + version.encode('utf-8'))
940
return ' '.join(result_list)
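
    # Illustrative example (not from the original source): with 'rev-1' cached
    # at history index 0 and 'ghost-rev' absent from the index,
    # _version_list_to_index(['rev-1', 'ghost-rev']) returns "0 .ghost-rev".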

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self._cache_version(version_id, options, pos, size, parents)

        content = "%s %s %s %s %s\n" % (version_id.encode('utf-8'),
                                        ','.join(options),
                                        pos,
                                        size,
                                        self._version_list_to_index(parents))
        assert isinstance(content, str), 'content must be utf-8 encoded'
        self._transport.append(self._filename, StringIO(content))

    def has_version(self, version_id):
        """True if the version is in the index."""
        return self._cache.has_key(version_id)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self._filename)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    HEADER = "# bzr knit data 7\n"

    def __init__(self, transport, filename, mode, create=False):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._file = None
        self._checked = False
        if create:
            self._transport.put(self._filename, StringIO(''))
        self._records = {}

    def clear_cache(self):
        """Clear the record cache."""
        self._records = {}

    def _open_file(self):
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id.encode('utf-8'),
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n\n" % version_id.encode('utf-8')]))
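        # The uncompressed payload written above is, illustratively:
        #   version <utf8 version id> <line count> <sha1 digest>\n
        #   <the stored lines>
        #   end <utf8 version id>\n\n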
        data_file.close()
        length = sio.tell()
        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file."""
        assert isinstance(raw_data, str), 'data must be plain bytes'
        start_pos = self._transport.append(self._filename, StringIO(raw_data))
        return start_pos, len(raw_data)

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        # cache the record for later reads
        self._records[version_id] = (digest, lines)
        # write to disk
        start_pos = self._transport.append(self._filename, sio)
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if rec[1].decode('utf-8') != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                  version_id, rec[1]))
        return df, rec

    def _parse_record(self, version_id, data):
        df, rec = self._parse_record_header(version_id, data)
        lines = int(rec[2])
        record_contents = self._read_record_contents(df, lines)
        l = df.readline()
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        df.close()
        return record_contents, rec[3]

    def _read_record_contents(self, df, record_lines):
        """Read and return n lines from datafile."""
        r = []
        for i in range(record_lines):
            r.append(df.readline())
        return r

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.

        It will actively recompress currently cached records on the
        basis that that is cheaper than I/O activity.
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(needed_records):
            # grab the disk data needed.
            raw_records = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

        for version_id, pos, size in records:
            if version_id in self._records:
                # compress a new version
                size, sio = self._record_to_data(version_id,
                                                 self._records[version_id][0],
                                                 self._records[version_id][1])
                yield version_id, sio.getvalue()
            else:
                pos, data = raw_records.next()
                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
                yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        if len(needed_records):
            # We take it that the transport optimizes the fetching as well
            # as possible (ie, reads continuous ranges.)
            response = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

            for (record_id, pos, size), (pos, data) in izip(iter(needed_records), response):
                content, digest = self._parse_record(record_id, data)
                self._records[record_id] = (digest, content)

        for version_id, pos, size in records:
            yield version_id, copy(self._records[version_id][1]), copy(self._records[version_id][0])

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
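        # Illustrative shape (not from the original source): the result maps
        # version ids to (stored_lines, sha1_digest) pairs,
        # e.g. {'rev-1': (lines, digest)}.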
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits."""
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        if version_ids is None:
            version_ids = self.source.versions()
        else:
            if not ignore_missing:
                self.source._check_versions_present(version_ids)
            else:
                version_ids = set(self.source.versions()).intersection(
                    set(version_ids))

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not cross_check_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must:
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            # we want the raw gzip for bulk copying, but the record validated
            # just enough to be sure it's the right one.
            # TODO: consider writev or write combining to reduce
            # death of a thousand cuts feeling.
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                pos, size = self.target._data.add_raw_record(raw_data)
                self.target._index.add_version(version_id, options, pos, size, parents)

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)
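
# Illustrative note (not from the original source): registering the optimiser
# means InterVersionedFile.get(source, target) can hand back an InterKnit when
# both files are knits, so joins between knits take the fast path above.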


# make GzipFile faster:
class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
    +58971     0    328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def write(self, data):
        if self.mode != gzip.WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        data_len = len(data)
        self.size = self.size + data_len
        self.crc = zlib.crc32(data, self.crc)
        self.fileobj.write( self.compress.compress(data) )
        self.offset += data_len

    def writelines(self, lines):
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
            # following improvement
            #      704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to:
            #      704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize