# Copyright (C) 2005, 2006, 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
writes.

lifeless: the data file is made up of "delta records".  each delta record has a delta header
that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent).  the delta also ends with an
end-marker; simply "end VERSION"

delta can be line or full contents.
... the 8's there are the index number of the annotation.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8         e.set('executable', 'yes')
8         if elt.get('executable') == 'yes':
8             ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation
"""
# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave'
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?
from cStringIO import StringIO
from itertools import izip, chain
import operator
import warnings

from bzrlib import (
    errors,
    merge,
    osutils,
    patiencediff,
    progress,
    ui,
    )
from bzrlib.errors import (
    InvalidRevisionId,
    KnitCorrupt,
    KnitHeaderError,
    NoSuchFile,
    RevisionNotPresent,
    RevisionAlreadyPresent,
    )
from bzrlib.tuned_gzip import GzipFile
from bzrlib.osutils import (
    contains_whitespace,
    sha_strings,
    )
from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed
from bzrlib.tsort import topo_sort
from bzrlib.versionedfile import VersionedFile, InterVersionedFile

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        return iter(self._lines)

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = new_lines.text()
        old_texts = self.text()
        s = KnitSequenceMatcher(None, old_texts, new_texts)
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'equal':
                continue
            # ofrom, oto, length, data
            yield i1, i2, j2 - j1, new_lines._lines[j1:j2]

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))
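
    # Illustrative sketch (hypothetical annotated contents): diffing two
    # KnitContents yields (start, end, count, lines) hunks against the old
    # text, with unchanged regions omitted:
    #
    #   old = KnitContent([('rev-a', 'foo\n'), ('rev-a', 'bar\n')])
    #   new = KnitContent([('rev-a', 'foo\n'), ('rev-b', 'baz\n')])
    #   old.line_delta(new) => [(1, 2, 1, [('rev-b', 'baz\n')])]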

    def text(self):
        return [text for origin, text in self._lines]

    def copy(self):
        return KnitContent(self._lines[:])

    @staticmethod
    def get_line_delta_blocks(knit_delta, source, target):
        """Extract SequenceMatcher.get_matching_blocks() from a knit delta"""
        target_len = len(target)
        s_pos = 0
        t_pos = 0
        for s_begin, s_end, t_len, new_text in knit_delta:
            true_n = s_begin - s_pos
            n = true_n
            if n > 0:
                # knit deltas do not provide reliable info about whether the
                # last line of a file matches, due to eol handling.
                if source[s_pos + n - 1] != target[t_pos + n - 1]:
                    n -= 1
                if n > 0:
                    yield s_pos, t_pos, n
            t_pos += t_len + true_n
            s_pos = s_end
        n = target_len - t_pos
        if n > 0:
            if source[s_pos + n - 1] != target[t_pos + n - 1]:
                n -= 1
            if n > 0:
                yield s_pos, t_pos, n
        yield s_pos + (target_len - t_pos), target_len, 0


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version_id):
        num_lines = len(lines)
        return KnitContent(zip([version_id] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version_id):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        # TODO: jam 20070209 The tests expect this to be returned as tuples,
        #       but the code itself doesn't really depend on that.
        #       Figure out a way to not require the overhead of turning the
        #       list back into tuples.
        lines = [tuple(line.split(' ', 1)) for line in content]
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines):
        return iter(self.parse_line_delta(lines))

    def parse_line_delta(self, lines, version_id):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
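        # For example (hypothetical revision ids), the serialized delta
        #   ['1,2,2\n', 'rev-a line one\n', 'rev-b line two\n']
        # parses to
        #   [(1, 2, 2, [('rev-a', 'line one\n'), ('rev-b', 'line two\n')])]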
        result = []
        lines = iter(lines)
        next = lines.next

        cache = {}
        def cache_and_return(line):
            origin, text = line.split(' ', 1)
            return cache.setdefault(origin, origin), text

        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = [tuple(next().split(' ', 1)) for i in xrange(count)]
            result.append((start, end, count, contents))
        return result

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return (line.split(' ', 1)[1] for line in lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                origin, text = next().split(' ', 1)
                yield text

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        return ['%s %s' % (o, t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        # TODO: jam 20070209 We only do the caching thing to make sure that
        #       the origin is a valid utf-8 line, eventually we could remove it
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(origin + ' ' + text
                       for origin, text in lines)
        return out


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version_id):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version_id)

    def parse_line_delta_iter(self, lines, version_id):
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version_id] * c, lines[cur:cur+c])
            cur += c

    def parse_line_delta(self, lines, version_id):
        return list(self.parse_line_delta_iter(lines, version_id))

    def get_fulltext_content(self, lines):
        """Extract just the content lines from a fulltext."""
        return iter(lines)

    def get_linedelta_content(self, lines):
        """Extract just the content from a line delta.

        This doesn't return all of the extra information stored in a delta.
        Only the actual content lines.
        """
        lines = iter(lines)
        next = lines.next
        for header in lines:
            header = header.split(',')
            count = int(header[2])
            for i in xrange(count):
                yield next()

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out
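
    # Illustrative sketch (hypothetical delta): lower_line_delta serializes
    # [(1, 2, 1, [('rev-b', 'baz\n')])] to ['1,2,1\n', 'baz\n'], dropping
    # the per-line origins that the annotated factory would keep.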


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    k._data._open_file()


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None,
                 factory=None, basis_knit=DEPRECATED_PARAMETER, delta=True,
                 create=False, create_parent_dir=False, delay_create=False,
                 dir_mode=None, index=None):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        :param create_parent_dir: If True, create the parent directory if
            creating the file fails. (This is used for stores with
            hash-prefixes that may not exist yet)
        :param delay_create: The calling code is aware that the knit won't
            actually be created until the first data is stored.
        :param index: An index to use for the knit.
        """
        if deprecated_passed(basis_knit):
            warnings.warn("KnitVersionedFile.__init__(): The basis_knit"
                          " parameter is deprecated as of bzr 0.9.",
                          DeprecationWarning, stacklevel=2)
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        self.transport = transport
        self.filename = relpath
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._max_delta_chain = 200

        if index is None:
            self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
                access_mode, create=create, file_mode=file_mode,
                create_parent_dir=create_parent_dir, delay_create=delay_create,
                dir_mode=dir_mode)
        else:
            self._index = index
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=create and not len(self), file_mode=file_mode,
            create_parent_dir=create_parent_dir, delay_create=delay_create,
            dir_mode=dir_mode)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__,
                           self.transport.abspath(self.filename))

    def _check_should_delta(self, first_parents):
        """Iterate back through the parent listing, looking for a fulltext.

        This is used when we want to decide whether to add a delta or a new
        fulltext. It searches for _max_delta_chain parents. When it finds a
        fulltext parent, it sees if the total size of the deltas leading up to
        it is large enough to indicate that we want a new full text anyway.

        Return True if we should create a new delta, False if we should use a
        full text.
        """
        delta_size = 0
        fulltext_size = None
        delta_parents = first_parents
        for count in xrange(self._max_delta_chain):
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            pos, size = self._index.get_position(parent)
            if method == 'fulltext':
                fulltext_size = size
                break
            delta_size += size
            delta_parents = self._index.get_parents(parent)
        else:
            # We couldn't find a fulltext, so we must create a new one
            return False

        return fulltext_size > delta_size
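
    # A worked example (hypothetical sizes): with a delta chain rev-3 ->
    # rev-2 -> rev-1 where rev-1 is a 1000-byte fulltext and the two deltas
    # total 120 bytes, this returns True (1000 > 120), so a new delta is
    # preferred; once the accumulated deltas outgrow the fulltext it returns
    # False and a fresh fulltext is stored instead.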

    def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
        """See VersionedFile._add_delta()."""
        self._check_add(version_id, []) # should we check the lines ?
        self._check_versions_present(parents)
        present_parents = []
        ghosts = []
        parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta_parent is None:
            # reconstitute as full text.
            assert len(delta) == 1 or len(delta) == 0
            if len(delta):
                assert delta[0][0] == 0
                assert delta[0][1] == 0, delta[0][1]
            return super(KnitVersionedFile, self)._add_delta(version_id,
                                                             parents,
                                                             delta_parent,
                                                             sha1,
                                                             noeol,
                                                             delta)

        digest = sha1

        options = []
        if noeol:
            options.append('no-eol')

        if delta_parent is not None:
            # determine the current delta chain length.
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            # The window was changed to a maximum of 200 deltas, but also added
            # was a check that the total compressed size of the deltas is
            # smaller than the compressed size of the fulltext.
            if not self._check_should_delta([delta_parent]):
                # We don't want a delta here, just do a normal insertion.
                return super(KnitVersionedFile, self)._add_delta(version_id,
                                                                 parents,
                                                                 delta_parent,
                                                                 sha1,
                                                                 noeol,
                                                                 delta)

        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)

    def _add_raw_records(self, records, data):
        """Add all the records 'records' with data pre-joined in 'data'.

        :param records: A list of tuples(version_id, options, parents, size).
        :param data: The data for the records. When it is written, the records
                     are adjusted to have pos pointing into data by the sum of
                     the preceding records sizes.
        """
        # write all the data
        pos = self._data.add_raw_record(data)
        offset = 0
        index_entries = []
        for (version_id, options, parents, size) in records:
            index_entries.append((version_id, options, pos+offset,
                                  size, parents))
            if self._data._do_cache:
                self._data._cache[version_id] = data[offset:offset+size]
            offset += size
        self._index.add_versions(index_entries)

    def enable_cache(self):
        """Start caching data for this knit"""
        self._data.enable_cache()

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes
        transport.put_file_non_atomic(name + INDEX_SUFFIX + '.tmp',
                self.transport.get(self._index._filename))
        # copy the data file
        f = self._data._open_file()
        try:
            transport.put_file(name + DATA_SUFFIX, f)
        finally:
            f.close()
        # move the copied index into place
        transport.move(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory,
                                 delta=self.delta, create=True)

    def _fix_parents(self, version_id, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        the parents list must be a superset of the current
        parents.
        """
        current_values = self._index._cache[version_id]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version_id,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def _extract_blocks(self, version_id, source, target):
        if self._index.get_method(version_id) != 'line-delta':
            return None
        parent, sha1, noeol, delta = self.get_delta(version_id)
        return KnitContent.get_line_delta_blocks(delta, source, target)

    def get_delta(self, version_id):
        """Get a delta for constructing version from some other version."""
        version_id = osutils.safe_revision_id(version_id)
        self.check_not_reserved_id(version_id)
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        parents = self.get_parents(version_id)
        if len(parents):
            parent = parents[0]
        else:
            parent = None
        data_pos, data_size = self._index.get_position(version_id)
        data, sha1 = self._data.read_records(((version_id, data_pos, data_size),))[version_id]
        noeol = 'no-eol' in self._index.get_options(version_id)
        if 'fulltext' == self._index.get_method(version_id):
            new_content = self.factory.parse_fulltext(data, version_id)
            if parent is not None:
                reference_content = self._get_content(parent)
                old_texts = reference_content.text()
            else:
                old_texts = []
            new_texts = new_content.text()
            delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
            return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)
        else:
            delta = self.factory.parse_line_delta(data, version_id)
            return parent, sha1, noeol, delta

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    def get_sha1(self, version_id):
        return self.get_sha1s([version_id])[0]

    def get_sha1s(self, version_ids):
        """See VersionedFile.get_sha1()."""
        version_ids = [osutils.safe_revision_id(v) for v in version_ids]
        record_map = self._get_record_map(version_ids)
        # record entry 2 is the 'digest'.
        return [record_map[v][2] for v in version_ids]

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        version_id = osutils.safe_revision_id(version_id)
        # maybe we have it
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        version_id = osutils.safe_revision_id(version_id)
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        if annotated:
            delta_seq = None
            for parent_id in parents:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = patiencediff.PatienceSequenceMatcher(
                    None, merge_content.text(), content.text())
                if delta_seq is None:
                    # setup a delta seq to reuse.
                    delta_seq = seq
                for i, j, n in seq.get_matching_blocks():
                    if n == 0:
                        continue
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            if not annotated:
                reference_content = self._get_content(parents[0], parent_texts)
                new_texts = content.text()
                old_texts = reference_content.text()
                delta_seq = patiencediff.PatienceSequenceMatcher(
                    None, old_texts, new_texts)
            return self._make_line_delta(delta_seq, content)

    def _make_line_delta(self, delta_seq, new_content):
        """Generate a line delta from delta_seq and new_content."""
        diff_hunks = []
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))
        return diff_hunks

    def _get_components_positions(self, version_ids):
        """Produce a map of position data for the components of versions.

        This data is intended to be used for retrieving the knit records.

        A dict of version_id to (method, data_pos, data_size, next) is
        returned.
        method is the way referenced data should be applied.
        data_pos is the position of the data in the knit.
        data_size is the size of the data in the knit.
        next is the build-parent of the version, or None for fulltexts.
        """
        component_data = {}
        for version_id in version_ids:
            cursor = version_id

            while cursor is not None and cursor not in component_data:
                method = self._index.get_method(cursor)
                if method == 'fulltext':
                    next = None
                else:
                    next = self.get_parents(cursor)[0]
                data_pos, data_size = self._index.get_position(cursor)
                component_data[cursor] = (method, data_pos, data_size, next)
                cursor = next
        return component_data
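
    # Illustrative result (hypothetical ids): asking for ['rev-3'] where
    # rev-3 and rev-2 are line-deltas built on the fulltext rev-1 might
    # return:
    #   {'rev-3': ('line-delta', 200, 60, 'rev-2'),
    #    'rev-2': ('line-delta', 120, 80, 'rev-1'),
    #    'rev-1': ('fulltext', 0, 120, None)}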

    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        text_map, contents_map = self._get_content_maps([version_id])
        return contents_map[version_id]

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        self._index.check_versions_present(version_ids)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id, self.filename)
        self.check_not_reserved_id(version_id)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        self._check_lines_not_unicode(lines)
        self._check_lines_are_lines(lines)

    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # 461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400   0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461   0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461   0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461   0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461   0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383  0      8.0370      8.0370   +<len>
        # +61    0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61    0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61    0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61    0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines:
            if lines[-1][-1] != '\n':
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spent applying deltas.
            delta = self._check_should_delta(present_parents)

        assert isinstance(version_id, str)
        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = self._merge_annotations(lines, present_parents,
                parent_texts, delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
        return digest

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null
        # delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self.get_line_list([version_id])[0]

    def _get_record_map(self, version_ids):
        """Produce a dictionary of knit records.

        The keys are version_ids, the values are tuples of (method, content,
        digest, next).
        method is the way the content should be applied.
        content is a KnitContent object.
        digest is the SHA1 digest of this version id after all steps are done
        next is the build-parent of the version, i.e. the leftmost ancestor.
        If the method is fulltext, next will be None.
        """
        position_map = self._get_components_positions(version_ids)
        # c = component_id, m = method, p = position, s = size, n = next
        records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]
        record_map = {}
        for component_id, content, digest in \
                self._data.read_records_iter(records):
            method, position, size, next = position_map[component_id]
            record_map[component_id] = method, content, digest, next

        return record_map

    def get_text(self, version_id):
        """See VersionedFile.get_text"""
        return self.get_texts([version_id])[0]

    def get_texts(self, version_ids):
        return [''.join(l) for l in self.get_line_list(version_ids)]

    def get_line_list(self, version_ids):
        """Return the texts of listed versions as a list of strings."""
        version_ids = [osutils.safe_revision_id(v) for v in version_ids]
        for version_id in version_ids:
            self.check_not_reserved_id(version_id)
        text_map, content_map = self._get_content_maps(version_ids)
        return [text_map[v] for v in version_ids]

    _get_lf_split_line_list = get_line_list

    def _get_content_maps(self, version_ids):
        """Produce maps of text and KnitContents

        :return: (text_map, content_map) where text_map contains the texts for
        the requested versions and content_map contains the KnitContents.
        Both dicts take version_ids as their keys.
        """
        for version_id in version_ids:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        record_map = self._get_record_map(version_ids)

        text_map = {}
        content_map = {}
        final_content = {}
        for version_id in version_ids:
            components = []
            cursor = version_id
            while cursor is not None:
                method, data, digest, next = record_map[cursor]
                components.append((cursor, method, data, digest))
                if cursor in content_map:
                    break
                cursor = next

            content = None
            for component_id, method, data, digest in reversed(components):
                if component_id in content_map:
                    content = content_map[component_id]
                else:
                    if method == 'fulltext':
                        assert content is None
                        content = self.factory.parse_fulltext(data, version_id)
                    elif method == 'line-delta':
                        delta = self.factory.parse_line_delta(data, version_id)
                        content = content.copy()
                        content._lines = self._apply_delta(content._lines,
                                                           delta)
                    content_map[component_id] = content

            if 'no-eol' in self._index.get_options(version_id):
                content = content.copy()
                line = content._lines[-1][1].rstrip('\n')
                content._lines[-1] = (content._lines[-1][0], line)
            final_content[version_id] = content

            # digest here is the digest from the last applied component.
            text = content.text()
            if sha_strings(text) != digest:
                raise KnitCorrupt(self.filename,
                                  'sha-1 does not match %s' % version_id)

            text_map[version_id] = text
        return text_map, final_content

    def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                                pb=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        else:
            version_ids = [osutils.safe_revision_id(v) for v in version_ids]
        if pb is None:
            pb = progress.DummyProgress()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = set(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        for version_id in self.versions():
            if version_id in requested_versions:
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        total = len(version_id_records)
        for version_idx, (version_id, data, sha_value) in \
                enumerate(self._data.read_records_iter(version_id_records)):
            pb.update('Walking content.', version_idx, total)
            method = self._index.get_method(version_id)

            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                line_iterator = self.factory.get_fulltext_content(data)
            else:
                line_iterator = self.factory.get_linedelta_content(data)
            for line in line_iterator:
                yield line

        pb.update('Walking content.', total, total)

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        version_ids = [osutils.safe_revision_id(version_id) for
                       version_id in version_ids]
        return self._index.iter_parents(version_ids)

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        version_id = osutils.safe_revision_id(version_id)
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        # perf notes:
        # optimism counts!
        # 52554 calls in 1264 872 internal down from 3674
        version_id = osutils.safe_revision_id(version_id)
        try:
            return self._index.get_parents(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents."""
        version_id = osutils.safe_revision_id(version_id)
        try:
            return self._index.get_parents_with_ghosts(version_id)
        except KeyError:
            raise RevisionNotPresent(version_id, self.filename)

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        versions = [osutils.safe_revision_id(v) for v in versions]
        return self._index.get_ancestry(versions, topo_sorted)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        versions = [osutils.safe_revision_id(v) for v in versions]
        return self._index.get_ancestry_with_ghosts(versions)

    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = set(self.get_ancestry(version_ids, topo_sorted=False))
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line

    def plan_merge(self, ver_a, ver_b):
        """See VersionedFile.plan_merge."""
        ver_a = osutils.safe_revision_id(ver_a)
        ver_b = osutils.safe_revision_id(ver_b)
        ancestors_b = set(self.get_ancestry(ver_b, topo_sorted=False))

        ancestors_a = set(self.get_ancestry(ver_a, topo_sorted=False))
        annotated_a = self.annotate(ver_a)
        annotated_b = self.annotate(ver_b)
        return merge._plan_annotate_merge(annotated_a, annotated_b,
                                          ancestors_a, ancestors_b)


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode, file_mode=None,
                 create_parent_dir=False, dir_mode=None):
        self._transport = transport
        self._filename = filename
        self._mode = mode
        self._file_mode = file_mode
        self._dir_mode = dir_mode
        self._create_parent_dir = create_parent_dir
        self._need_to_create = False

    def _full_path(self):
        """Return the full path to this file."""
        return self._transport.base + self._filename

    def check_header(self, fp):
        line = fp.readline()
        if line == '':
            # An empty file can actually be treated as though the file doesn't
            # exist yet.
            raise errors.NoSuchFile(self._full_path())
        if line != self.HEADER:
            raise KnitHeaderError(badline=line,
                                  filename=self._transport.abspath(self._filename))

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to a Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record in the data
        file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    When a write to the index file is interrupted, it will result in a line
    that does not end in ' :'. If the ' :' is not present at the end of a line,
    or at the end of the file, then the record that is missing it will be
    ignored by the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """
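
    # An illustrative index file (hypothetical revision ids), per the format
    # described above:
    #
    #   # bzr knit index 8
    #   rev-1 fulltext 0 123  :
    #   rev-2 line-delta 123 104 0 :
    #   rev-3 line-delta 227 86 1 .ghost-rev :
    #
    # rev-2's parent is given dictionary-compressed as sequence number 0
    # (rev-1); rev-3 depends on rev-2 (sequence 1) and on a ghost parent
    # recorded by its full id prefixed with '.'.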

    HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into _load_data for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False, dir_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        try:
            fp = self._transport.get(self._filename)
            try:
                # _load_data may raise NoSuchFile if the target knit is
                # completely empty.
                _load_data(self, fp)
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            elif delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(
                    self._filename, self.HEADER, mode=self._file_mode)

    def get_graph(self):
        """Return a list of the node:parents lists from this knit index."""
        return [(vid, idx[4]) for vid, idx in self._cache.iteritems()]

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        cache = self._cache
        while pending:
            version = pending.pop()
            # trim ghosts
            try:
                parents = [p for p in cache[version][4] if p in cache]
            except KeyError:
                raise RevisionNotPresent(version, self._filename)
            # if not completed and not a ghost
            pending.update([p for p in parents if p not in graph])
            graph[version] = parents
        if not topo_sorted:
            return graph.keys()
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        self.check_versions_present(versions)
        graph = {}
        cache = self._cache
        pending = set(versions)
        while pending:
            version = pending.pop()
            try:
                parents = cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # if not completed
                pending.update([p for p in parents if p not in graph])
                graph[version] = parents
        return topo_sort(graph.items())

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        for version_id in version_ids:
            try:
                yield version_id, tuple(self.get_parents(version_id))
            except KeyError:
                pass

    def num_versions(self):
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        """Get all the versions in the file. not topologically sorted."""
        return self._history

    def _version_list_to_index(self, versions):
        result_list = []
        cache = self._cache
        for version in versions:
            if version in cache:
                # -- inlined lookup() --
                result_list.append(str(cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version)
        return ' '.join(result_list)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        lines = []
        orig_history = self._history[:]
        orig_cache = self._cache.copy()

        try:
            for version_id, options, pos, size, parents in versions:
                line = "\n%s %s %s %s %s :" % (version_id,
                                               ','.join(options),
                                               pos,
                                               size,
                                               self._version_list_to_index(parents))
                assert isinstance(line, str), \
                    'content must be utf-8 encoded: %r' % (line,)
                lines.append(line)
                self._cache_version(version_id, options, pos, size, parents)
            if not self._need_to_create:
                self._transport.append_bytes(self._filename, ''.join(lines))
            else:
                sio = StringIO()
                sio.write(self.HEADER)
                sio.writelines(lines)
                sio.seek(0)
                self._transport.put_file_non_atomic(self._filename, sio,
                                    create_parent_dir=self._create_parent_dir,
                                    mode=self._file_mode,
                                    dir_mode=self._dir_mode)
                self._need_to_create = False
        except:
            # If any problems happen, restore the original values and re-raise
            self._history = orig_history
            self._cache = orig_cache
            raise

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        entry = self._cache[version_id]
        return entry[2], entry[3]

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            if 'line-delta' not in options:
                raise errors.KnitIndexUnknownMethod(self._full_path(), options)
            return 'line-delta'

    def get_options(self, version_id):
        """Return a string representation of the options.

        e.g. foo,bar
        """
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        cache = self._cache
        for version_id in version_ids:
            if version_id not in cache:
                raise RevisionNotPresent(version_id, self._filename)


class KnitGraphIndex(object):
    """A knit index that builds on GraphIndex."""

    def __init__(self, graph_index, deltas=False, parents=True, add_callback=None):
        """Construct a KnitGraphIndex on a graph_index.

        :param graph_index: An implementation of bzrlib.index.GraphIndex.
        :param deltas: Allow delta-compressed records.
        :param add_callback: If not None, allow additions to the index and call
            this callback with a list of added GraphIndex nodes:
            [(node, value, node_refs), ...]
        :param parents: If True, record knit parents, if not do not record
            parents.
        """
        self._graph_index = graph_index
        self._deltas = deltas
        self._add_callback = add_callback
        self._parents = parents
        if deltas and not parents:
            raise KnitCorrupt(self, "Cannot do delta compression without "
                "parent tracking.")

    def _get_entries(self, keys, check_present=False):
        """Get the entries for keys.

        :param keys: An iterable of index keys, - 1-tuples.
        """
        keys = set(keys)
        found_keys = set()
        if self._parents:
            for node in self._graph_index.iter_entries(keys):
                yield node
                found_keys.add(node[1])
        else:
            # adapt parentless index to the rest of the code.
            for node in self._graph_index.iter_entries(keys):
                yield node[0], node[1], node[2], ()
                found_keys.add(node[1])
        if check_present:
            missing_keys = keys.difference(found_keys)
            if missing_keys:
                raise RevisionNotPresent(missing_keys.pop(), self)

    def _present_keys(self, version_ids):
        return set([
            node[1] for node in self._get_entries(version_ids)])

    def _parentless_ancestry(self, versions):
        """Honour the get_ancestry API for parentless knit indices."""
        wanted_keys = self._version_ids_to_keys(versions)
        present_keys = self._present_keys(wanted_keys)
        missing = set(wanted_keys).difference(present_keys)
        if missing:
            raise RevisionNotPresent(missing.pop(), self)
        return list(self._keys_to_version_ids(present_keys))

    def get_ancestry(self, versions, topo_sorted=True):
        """See VersionedFile.get_ancestry."""
        if not self._parents:
            return self._parentless_ancestry(versions)
        # XXX: This will do len(history) index calls - perhaps
        # it should be altered to be a index core feature?
        # get a graph of all the mentioned versions:
        graph = {}
        ghosts = set()
        versions = self._version_ids_to_keys(versions)
        pending = set(versions)
        while pending:
            # get all pending nodes
            this_iteration = pending
            new_nodes = self._get_entries(this_iteration)
            found = set()
            pending = set()
            for (index, key, value, node_refs) in new_nodes:
                # don't ask for ghosts - otherwise
                # we can end up looping with pending
                # being entirely ghosted.
                graph[key] = [parent for parent in node_refs[0]
                    if parent not in ghosts]
                # queue parents
                for parent in graph[key]:
                    # don't examine known nodes again
                    if parent in graph:
                        continue
                    pending.add(parent)
                found.add(key)
            ghosts.update(this_iteration.difference(found))
        if versions.difference(graph):
            raise RevisionNotPresent(versions.difference(graph).pop(), self)
        if topo_sorted:
            result_keys = topo_sort(graph.items())
        else:
            result_keys = graph.iterkeys()
        return [key[0] for key in result_keys]

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if not self._parents:
            return self._parentless_ancestry(versions)
        # XXX: This will do len(history) index calls - perhaps
        # it should be altered to be a index core feature?
        # get a graph of all the mentioned versions:
        graph = {}
        versions = self._version_ids_to_keys(versions)
        pending = set(versions)
        while pending:
            # get all pending nodes
            this_iteration = pending
            new_nodes = self._get_entries(this_iteration)
            pending = set()
            for (index, key, value, node_refs) in new_nodes:
                graph[key] = node_refs[0]
                # queue parents
                for parent in graph[key]:
                    # don't examine known nodes again
                    if parent in graph:
                        continue
                    pending.add(parent)
            missing_versions = this_iteration.difference(graph)
            missing_needed = versions.intersection(missing_versions)
            if missing_needed:
                raise RevisionNotPresent(missing_needed.pop(), self)
            for missing_version in missing_versions:
                # add a key, no parents
                graph[missing_version] = []
                pending.discard(missing_version) # don't look for it
        result_keys = topo_sort(graph.items())
        return [key[0] for key in result_keys]

    def get_graph(self):
        """Return a list of the node:parents lists from this knit index."""
        if not self._parents:
            return [(key, ()) for key in self.get_versions()]
        result = []
        for index, key, value, refs in self._graph_index.iter_all_entries():
            result.append((key[0], tuple([ref[0] for ref in refs[0]])))
        return result

    def iter_parents(self, version_ids):
        """Iterate through the parents for many version ids.

        :param version_ids: An iterable yielding version_ids.
        :return: An iterator that yields (version_id, parents). Requested
            version_ids not present in the versioned file are simply skipped.
            The order is undefined, allowing for different optimisations in
            the underlying implementation.
        """
        if self._parents:
            all_nodes = set(self._get_entries(self._version_ids_to_keys(version_ids)))
            all_parents = set()
            present_parents = set()
            for node in all_nodes:
                all_parents.update(node[3][0])
                # any node we are querying must be present
                present_parents.add(node[1])
            unknown_parents = all_parents.difference(present_parents)
            present_parents.update(self._present_keys(unknown_parents))
            for node in all_nodes:
                parents = []
                for parent in node[3][0]:
                    if parent in present_parents:
                        parents.append(parent[0])
                yield node[1][0], tuple(parents)
        else:
            for node in self._get_entries(self._version_ids_to_keys(version_ids)):
                yield node[1][0], ()

    def num_versions(self):
        return len(list(self._graph_index.iter_all_entries()))

    __len__ = num_versions

    def get_versions(self):
        """Get all the versions in the file. not topologically sorted."""
        return [node[1][0] for node in self._graph_index.iter_all_entries()]

    def has_version(self, version_id):
        """True if the version is in the index."""
        return len(self._present_keys(self._version_ids_to_keys([version_id]))) == 1

    def _keys_to_version_ids(self, keys):
        return tuple(key[0] for key in keys)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        bits = self._get_node(version_id)[2][1:].split(' ')
        return int(bits[0]), int(bits[1])
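
    # Node values encode the no-eol flag plus position and size, e.g.
    # (illustrative) ' 0 82' or 'N82 104'; get_position skips the flag byte
    # and parses the two integers (see add_versions below).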

    def get_method(self, version_id):
        """Return compression method of specified version."""
        if not self._deltas:
            return 'fulltext'
        return self._parent_compression(self._get_node(version_id)[3][1])

    def _parent_compression(self, reference_list):
        # use the second reference list to decide if this is delta'd or not.
        if len(reference_list):
            return 'line-delta'
        else:
            return 'fulltext'

    def _get_node(self, version_id):
        return list(self._get_entries(self._version_ids_to_keys([version_id])))[0]

    def get_options(self, version_id):
        """Return a string representation of the options.

        e.g. foo,bar
        """
        node = self._get_node(version_id)
        if not self._deltas:
            options = ['fulltext']
        else:
            options = [self._parent_compression(node[3][1])]
        if node[2][0] == 'N':
            options.append('no-eol')
        return ','.join(options)

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        parents = list(self.iter_parents([version_id]))
        if not parents:
            # missing key
            raise errors.RevisionNotPresent(version_id, self)
        return parents[0][1]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        nodes = list(self._get_entries(self._version_ids_to_keys([version_id]),
            check_present=True))
        if not self._parents:
            return ()
        return self._keys_to_version_ids(nodes[0][3][0])

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        keys = self._version_ids_to_keys(version_ids)
        present = self._present_keys(keys)
        missing = keys.difference(present)
        if missing:
            raise RevisionNotPresent(missing.pop(), self)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        return self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        This function does not insert data into the Immutable GraphIndex
        backing the KnitGraphIndex, instead it prepares data for insertion by
        the caller and checks that it is safe to insert then calls
        self._add_callback with the prepared GraphIndex nodes.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        if not self._add_callback:
            raise errors.ReadOnlyError(self)
        # we hope there are no repositories with inconsistent parentage
        # anywhere. Otherwise this code path will fail badly.
        keys = {}
        for (version_id, options, pos, size, parents) in versions:
            # index keys are tuples:
            key = (version_id, )
            parents = tuple((parent, ) for parent in parents)
            if 'no-eol' in options:
                value = 'N'
            else:
                value = ' '
            value += "%d %d" % (pos, size)
            if not self._deltas:
                if 'line-delta' in options:
                    raise KnitCorrupt(self, "attempt to add line-delta in non-delta knit")
            if self._parents:
                if self._deltas:
                    if 'line-delta' in options:
                        node_refs = (parents, (parents[0],))
                    else:
                        node_refs = (parents, ())
                else:
                    node_refs = (parents, )
            else:
                if parents:
                    raise KnitCorrupt(self, "attempt to add node with parents "
                        "in parentless index.")
                node_refs = ()
            keys[key] = (value, node_refs)
        present_nodes = self._get_entries(keys)
        for (index, key, value, node_refs) in present_nodes:
            if (value, node_refs) != keys[key]:
                raise KnitCorrupt(self, "inconsistent details in add_versions"
                    ": %s %s" % ((value, node_refs), keys[key]))
            del keys[key]
        result = []
        if self._parents:
            for key, (value, node_refs) in keys.iteritems():
                result.append((key, value, node_refs))
        else:
            for key, (value, node_refs) in keys.iteritems():
                result.append((key, value))
        self._add_callback(result)

    def _version_ids_to_keys(self, version_ids):
        return set((version_id, ) for version_id in version_ids)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False,
                 dir_mode=None):
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._checked = False
        # TODO: jam 20060713 conceptually, this could spill to disk
        #       if the cached size gets larger than a certain amount
        #       but it complicates the model a bit, so for now just use
        #       a simple dictionary
        self._cache = {}
        self._do_cache = False
        if create:
            if delay_create:
                self._need_to_create = create
            else:
                self._transport.put_bytes_non_atomic(self._filename, '',
                                                     mode=self._file_mode)

    def enable_cache(self):
        """Enable caching of reads."""
        self._do_cache = True

    def clear_cache(self):
        """Clear the record cache."""
        self._do_cache = False
        self._cache = {}

    def _open_file(self):
        try:
            return self._transport.get(self._filename)
        except NoSuchFile:
            pass
        return None

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)

        assert isinstance(version_id, str)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id,
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n" % version_id]))
        data_file.close()
        length = sio.tell()

        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file.

        :return: the offset in the data file raw_data was written.
        """
        assert isinstance(raw_data, str), 'data must be plain bytes'
        if not self._need_to_create:
            return self._transport.append_bytes(self._filename, raw_data)
        else:
            self._transport.put_bytes_non_atomic(self._filename, raw_data,
                                   create_parent_dir=self._create_parent_dir,
                                   mode=self._file_mode,
                                   dir_mode=self._dir_mode)
            self._need_to_create = False
            return 0

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        # write to disk
        if not self._need_to_create:
            start_pos = self._transport.append_file(self._filename, sio)
        else:
            self._transport.put_file_non_atomic(self._filename, sio,
                               create_parent_dir=self._create_parent_dir,
                               mode=self._file_mode,
                               dir_mode=self._dir_mode)
            self._need_to_create = False
            start_pos = 0
        if self._do_cache:
            self._cache[version_id] = sio.getvalue()
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        try:
            rec = self._check_header(version_id, df.readline())
        except Exception, e:
            raise KnitCorrupt(self._filename,
                              "While reading {%s} got %s(%s)"
                              % (version_id, e.__class__.__name__, str(e)))
        return df, rec

    def _check_header(self, version_id, line):
        rec = line.split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename,
                              'unexpected number of elements in record header')
        if rec[1] != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r'
                              % (version_id, rec[1]))
        return rec

    def _parse_record(self, version_id, data):
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        df = GzipFile(mode='rb', fileobj=StringIO(data))

        try:
            record_contents = df.readlines()
        except Exception, e:
            raise KnitCorrupt(self._filename,
                              "While reading {%s} got %s(%s)"
                              % (version_id, e.__class__.__name__, str(e)))
        header = record_contents.pop(0)
        rec = self._check_header(version_id, header)

        last_line = record_contents.pop()
        if len(record_contents) != int(rec[2]):
            raise KnitCorrupt(self._filename,
                              'incorrect number of lines %s != %s'
                              ' for version {%s}'
                              % (len(record_contents), int(rec[2]),
                                 version_id))
        if last_line != 'end %s\n' % rec[1]:
            raise KnitCorrupt(self._filename,
                              'unexpected version end line %r, wanted %r'
                              % (last_line, version_id))
        df.close()
        return record_contents, rec[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.
        """
        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(records):
            # grab the disk data needed.
            if self._cache:
                # Don't check _cache if it is empty
                needed_offsets = [(pos, size) for version_id, pos, size
                                              in records
                                              if version_id not in self._cache]
            else:
                needed_offsets = [(pos, size) for version_id, pos, size
                                              in records]

            raw_records = self._transport.readv(self._filename, needed_offsets)

        for version_id, pos, size in records:
            if version_id in self._cache:
                # This data has already been validated
                data = self._cache[version_id]
            else:
                pos, data = raw_records.next()
                if self._do_cache:
                    self._cache[version_id] = data

                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
            yield version_id, data
1851
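
    # Note that the data yielded here is still the gzipped on-disk record;
    # InterKnit.join below feeds it straight to the target's
    # _add_raw_records, avoiding a decompress/recompress cycle.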

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        The result will be returned in whatever is the fastest to read.
        Not by the order requested. Also, multiple requests for the same
        record will only yield 1 response.

        :param records: A list of (version_id, pos, len) entries
        :return: Yields (version_id, contents, digest) in the order
                 read, not the order requested
        """
        if not records:
            return

        if self._cache:
            # Skip records we have already seen
            yielded_records = set()
            needed_records = set()
            for record in records:
                if record[0] in self._cache:
                    if record[0] in yielded_records:
                        continue
                    yielded_records.add(record[0])
                    data = self._cache[record[0]]
                    content, digest = self._parse_record(record[0], data)
                    yield (record[0], content, digest)
                else:
                    needed_records.add(record)
            needed_records = sorted(needed_records, key=operator.itemgetter(1))
        else:
            needed_records = sorted(set(records), key=operator.itemgetter(1))

        if not needed_records:
            return

        # The transport optimizes the fetching as well
        # (ie, reads continuous ranges.)
        readv_response = self._transport.readv(self._filename,
            [(pos, size) for version_id, pos, size in needed_records])

        for (version_id, pos, size), (pos, data) in \
                izip(iter(needed_records), readv_response):
            content, digest = self._parse_record(version_id, data)
            if self._do_cache:
                self._cache[version_id] = data
            yield version_id, content, digest
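
    # Because results arrive in readv (offset) order, a caller that needs
    # its own ordering must reorder them, e.g. (sketch only):
    #
    #   by_id = dict((v, (c, d)) for v, c, d in
    #                data.read_records_iter(wanted))
    #   ordered = [by_id[v] for v, _, _ in wanted]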

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in \
                self.read_records_iter(records):
            components[record_id] = (content, digest)
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits."""
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that this will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must:
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)
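
# Once registered, the optimiser is picked up automatically; a caller would
# typically write something like the following (sketch, hypothetical knits)
# rather than naming InterKnit directly:
#
#   InterVersionedFile.get(source_knit, target_knit).join()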


class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            return (isinstance(source, bzrlib.weave.Weave) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # if all of n2's parents are in n1, then it's fine.
                if n2.difference(n1):
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that this will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                self.target.add_lines(
                    version_id, parents, self.source.get_lines(version_id))
                count = count + 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(WeaveToKnit)


class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 2, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to:
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>

            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize
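
# Illustrative use of the tuned matcher (KnitSequenceMatcher keeps the
# difflib.SequenceMatcher interface, so the inherited methods still apply):
#
#   sm = KnitSequenceMatcher(None, ['a\n', 'b\n'], ['a\n', 'c\n', 'b\n'])
#   blocks = sm.get_matching_blocks()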


try:
    from bzrlib._knit_load_data_c import _load_data_c as _load_data
except ImportError:
    from bzrlib._knit_load_data_py import _load_data_py as _load_data
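
# The try/except above prefers the compiled _load_data extension when it
# has been built, and otherwise falls back to the pure-Python version; both
# are expected to expose the same interface.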