/brz/remove-bazaar : revision 2155.1.1

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: John Arbash Meinel
Date: 2006-11-29 17:16:58 UTC
mto: This revision was merged to the branch mainline in revision 2156.
Revision ID: john@arbash-meinel.com-20061129171658-uwphkz5ntsb7bv0r

(Dmitry Vasiliev) pre-lookup encoders to improve performance

files added:
.bzrignore

.rsyncexclude

BRANCH.TODO

COPYING.txt

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

README

TODO

build-api

bzr.ico

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/benchmarks

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_bundle.py

bzrlib/benchmarks/bench_cache_utf8.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_info.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_sftp.py

bzrlib/benchmarks/bench_startup.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/benchmarks/bench_xml.py

bzrlib/benchmarks/tree_creator

bzrlib/benchmarks/tree_creator/__init__.py

bzrlib/benchmarks/tree_creator/heavily_merged.py

bzrlib/benchmarks/tree_creator/kernel_like.py

bzrlib/benchmarks/tree_creator/many_commit.py

bzrlib/benchmarks/tree_creator/simple_many_commit.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bundle

bzrlib/bundle/__init__.py

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

bzrlib/bundle/serializer

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v08.py

bzrlib/bundle/serializer/v09.py

bzrlib/bzrdir.py

bzrlib/cache_utf8.py

bzrlib/check.py

bzrlib/cmd_version_info.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/debug.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/generate_ids.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/help_topics.py

bzrlib/identitymap.py

bzrlib/ignores.py

bzrlib/info.py

bzrlib/inspect_for_copy.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lazy_import.py

bzrlib/lazy_regex.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/memorytree.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/mutabletree.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/patches.py

bzrlib/patiencediff.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/plugins/launchpad

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_registration.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/registry.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/revisiontree.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/status.py

bzrlib/store

bzrlib/store/__init__.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/HttpServer.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_bundle.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_debug.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_ignored.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_inventory.py

bzrlib/tests/blackbox/test_locale.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_ls.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_nick.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_remove_tree.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_serve.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_testament.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_version_info.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_break_lock.py

bzrlib/tests/branch_implementations/test_http.py

bzrlib/tests/branch_implementations/test_locking.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/intertree_implementations

bzrlib/tests/intertree_implementations/__init__.py

bzrlib/tests/intertree_implementations/test_compare.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/lock_helpers.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_break_lock.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/repository_implementations/test_revision.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_api.py

bzrlib/tests/test_atomicfile.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_cache_utf8.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_ftp_transport.py

bzrlib/tests/test_generate_ids.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_http_response.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_ignores.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lazy_import.py

bzrlib/tests/test_lazy_regex.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_memorytree.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_patches.py

bzrlib/tests/test_patches_data

bzrlib/tests/test_patches_data/diff

bzrlib/tests/test_patches_data/diff-2

bzrlib/tests/test_patches_data/diff-3

bzrlib/tests/test_patches_data/diff-4

bzrlib/tests/test_patches_data/diff-5

bzrlib/tests/test_patches_data/diff-6

bzrlib/tests/test_patches_data/insert_top.patch

bzrlib/tests/test_patches_data/mod

bzrlib/tests/test_patches_data/mod-2

bzrlib/tests/test_patches_data/mod-3

bzrlib/tests/test_patches_data/mod-4

bzrlib/tests/test_patches_data/mod-5

bzrlib/tests/test_patches_data/mod-6

bzrlib/tests/test_patches_data/orig

bzrlib/tests/test_patches_data/orig-2

bzrlib/tests/test_patches_data/orig-3

bzrlib/tests/test_patches_data/orig-4

bzrlib/tests/test_patches_data/orig-5

bzrlib/tests/test_patches_data/orig-6

bzrlib/tests/test_patches_data/patchtext.patch

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_registry.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revert.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_smart_transport.py

bzrlib/tests/test_source.py

bzrlib/tests/test_status.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tree.py

bzrlib/tests/test_treebuilder.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_urlutils.py

bzrlib/tests/test_version.py

bzrlib/tests/test_version_info.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_wsgi.py

bzrlib/tests/test_xml.py

bzrlib/tests/tree_implementations

bzrlib/tests/tree_implementations/__init__.py

bzrlib/tests/tree_implementations/test_test_trees.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/tests/workingtree_implementations/test_break_lock.py

bzrlib/tests/workingtree_implementations/test_changes_from.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_executable.py

bzrlib/tests/workingtree_implementations/test_flush.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_is_ignored.py

bzrlib/tests/workingtree_implementations/test_locking.py

bzrlib/tests/workingtree_implementations/test_merge_from_branch.py

bzrlib/tests/workingtree_implementations/test_mkdir.py

bzrlib/tests/workingtree_implementations/test_parents.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_put_file.py

bzrlib/tests/workingtree_implementations/test_read_working_inventory.py

bzrlib/tests/workingtree_implementations/test_set_root_id.py

bzrlib/tests/workingtree_implementations/test_unversion.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textfile.py

bzrlib/textinv.py

bzrlib/textmerge.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/chroot.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_pycurl_errors.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/http/_urllib2_wrappers.py

bzrlib/transport/http/response.py

bzrlib/transport/http/wsgi.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/transport/smart.py

bzrlib/transport/ssh.py

bzrlib/tree.py

bzrlib/treebuilder.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/urlutils.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/version.py

bzrlib/version_info_formats

bzrlib/version_info_formats/__init__.py

bzrlib/version_info_formats/format_python.py

bzrlib/version_info_formats/format_rio.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml6.py

bzrlib/xml_serializer.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/README.1st

doc/bazaar-vcs.org.kid

doc/centralized_workflow.txt

doc/configuration.txt

doc/default.css

doc/http_smart_server.txt

doc/index.txt

doc/plugins.txt

doc/server.txt

doc/setting_up_email.txt

doc/specifying_revisions.txt

doc/tutorial.txt

doc/using_aliases.txt

doc/version_info.txt

generate_docs.py

profile_imports.py

setup.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/doc_generate/autodoc_rstx.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/rst2html.py

tools/rst2prettyhtml.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

tools/win32

tools/win32/__init__.py

tools/win32/bazaar.url

tools/win32/bzr-win32-bdist-postinstall.py

tools/win32/bzr.iss.cog

tools/win32/bzr_postinstall.py

tools/win32/file_version.py

tools/win32/info.txt

tools/win32/ostools.py

tools/win32/start_bzr.bat

files removed:
.bzrignore

COPYING

HACKING

INSTALL

Makefile

NEWS

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

cache.py

commands.py

commit.py

config.py

dir.py

errors.py

fetch.py

help.py

hg.py

info.py

inventory.py

mapping.py

notes

notes/git-serve.txt

notes/mapping.txt

notes/roundtripping.txt

object_store.py

push.py

refs.py

remote.py

repository.py

revspec.py

roundtrip.py

send.py

server.py

setup.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_cache.py

tests/test_dir.py

tests/test_fetch.py

tests/test_mapping.py

tests/test_object_store.py

tests/test_push.py

tests/test_refs.py

tests/test_remote.py

tests/test_repository.py

tests/test_revspec.py

tests/test_roundtrip.py

tests/test_transportgit.py

transportgit.py

tree.py

versionedfiles.py

workingtree.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# Written by Martin Pool.

# Modified by Johan Rydberg <jrydberg@gnu.org>

# Modified by Robert Collins <robert.collins@canonical.com>

# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

updates.

Knit file layout:

lifeless: the data file is made up of "delta records". each delta record has a delta header

that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of

the -expanded data- (ie, the delta applied to the parent). the delta also ends with an

end-marker; simply "end VERSION"

delta can be line or full contents.

... the 8's there are the index number of the annotation.

version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e

59,59,3

8 if ie.executable:

8 e.set('executable', 'yes')

130,130,2

8 if elt.get('executable') == 'yes':

8 ie.executable = True

end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

whats in an index:

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents

09:33 < jrydberg> lifeless: the parents are currently dictionary compressed

09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)

09:33 < lifeless> right

09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used

in the deltas to provide line annotation

"""

# TODOS:

# 10:16 < lifeless> make partial index writes safe

# 10:16 < lifeless> implement 'knit.check()' like weave.check()

# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# always' approach.

# move sha1 out of the content so that join is faster at verifying parents

# record content length ?

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import operator

import os

import sys

import warnings

import bzrlib

from bzrlib import (

cache_utf8,

errors,

patiencediff,

progress,

)

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

from bzrlib.tuned_gzip import GzipFile

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed

from bzrlib.tsort import topo_sort

import bzrlib.weave

from bzrlib.versionedfile import VersionedFile, InterVersionedFile

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data

# files belong together?

100

101

# TODO: accommodate binaries, perhaps by storing a byte count

102

103

# TODO: function to check whole file

104

105

# TODO: atomically append data, then measure backwards from the cursor

106

# position after writing to work out where it was located. we may need to

107

# bypass python file buffering.

108

109

# Suffix appended to a knit's relpath to name its data file
# (KnitVersionedFile.__init__ passes relpath + DATA_SUFFIX to _KnitData).
DATA_SUFFIX = '.knit'

110

# Suffix appended to a knit's relpath to name its index file
# (KnitVersionedFile.__init__ passes relpath + INDEX_SUFFIX to _KnitIndex).
INDEX_SUFFIX = '.kndx'

111

112

113

class KnitContent(object):
    """Annotated lines of one knit version; the base that deltas apply to.

    The content is held in ``_lines`` as a sequence of (origin, text) pairs.
    """

    def __init__(self, lines):
        # lines: sequence of (origin, text) pairs, stored as given.
        self._lines = lines

    def annotate_iter(self):
        """Iterate over the (origin, text) pair of every content line."""
        return iter(self._lines)

    def annotate(self):
        """Return all (origin, text) pairs as a list."""
        return list(self.annotate_iter())

    def line_delta_iter(self, new_lines):
        """Yield the hunks that turn this content into new_lines.

        :param new_lines: The target content object; its text() and _lines
            are read.
        :return: An iterator of (old_start, old_end, new_length,
            new_annotated_lines) tuples, one per non-matching opcode.
        """
        target_texts = new_lines.text()
        source_texts = self.text()
        matcher = KnitSequenceMatcher(None, source_texts, target_texts)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            # Matching regions need no hunk; everything else replaces
            # old lines i1:i2 with the annotated new lines j1:j2.
            if tag != 'equal':
                yield i1, i2, j2 - j1, new_lines._lines[j1:j2]

    def line_delta(self, new_lines):
        """Return the hunks of line_delta_iter() as a list."""
        return list(self.line_delta_iter(new_lines))

    def text(self):
        """Return only the text of each line, dropping the origins."""
        texts = []
        for _origin, line_text in self._lines:
            texts.append(line_text)
        return texts

    def copy(self):
        """Return a new KnitContent holding a shallow copy of the lines."""
        duplicate_lines = self._lines[:]
        return KnitContent(duplicate_lines)

146

147

148

class _KnitFactory(object):
    """Shared base of the factories that build knit content objects."""

    def make(self, lines, version):
        """Wrap plain lines in a KnitContent, attributing each to version.

        :param lines: The text lines of the version.
        :param version: The version id paired with every line as its origin.
        :return: A KnitContent of (version, line) pairs.
        """
        origins = [version] * len(lines)
        return KnitContent(zip(origins, lines))

154

155

156

class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects.

    In the serialized form every text line is prefixed with the utf8
    revision id of its origin followed by a single space.
    """

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)

        :param content: Iterable of serialized lines.
        :param version: Unused here; each line carries its own origin.
        :return: A KnitContent of (revid, plaintext) tuples.
        """
        decode_utf8 = cache_utf8.decode
        lines = []
        for line in content:
            # Split on the first space only: the text may contain spaces.
            origin, text = line.split(' ', 1)
            lines.append((decode_utf8(origin), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version=None):
        """Iterate over the hunks produced by parse_line_delta().

        :param lines: Serialized line delta, as accepted by
            parse_line_delta().
        :param version: Passed through to parse_line_delta(), which does
            not use it for annotated content. Optional so that existing
            one-argument calls keep working.
        """
        # parse_line_delta is a method: it has to be called, not
        # subscripted, and it needs the version argument.
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])

        :param lines: Iterable of serialized delta lines.
        :param version: Unused here; each line carries its own origin.
        :return: A list of (start, end, count, contents) tuples.
        """
        decode_utf8 = cache_utf8.decode
        result = []
        lines = iter(lines)
        # Bound once for speed; named so it does not shadow the builtin.
        next_line = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            remaining = count
            while remaining:
                # Pull the hunk's body lines from the same iterator the
                # outer loop is consuming headers from.
                origin, text = next_line().split(' ', 1)
                remaining -= 1
                contents.append((decode_utf8(origin), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        encode_utf8 = cache_utf8.encode
        return ['%s %s' % (encode_utf8(o), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        encode_utf8 = cache_utf8.encode
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend(encode_utf8(origin) + ' ' + text
                       for origin, text in lines)
        return out

226

227

228

class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects.

    The serialized form carries no per-line origin; every parsed line is
    attributed to the version being parsed.
    """

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - its just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        """Yield (start, end, count, [(version, line), ...]) for each hunk.

        :param lines: Indexable sequence of serialized delta lines: a
            'start,end,count' header followed by count text lines, repeated.
            The sequence is only read; it is not consumed or modified.
        :param version: The version id attached to every hunk line.
        """
        # Walk with a cursor instead of pop(0)/del on the caller's list:
        # that left the argument emptied and cost O(n) per hunk.
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[cur:cur + c])
            cur += c

    def parse_line_delta(self, lines, version):
        """Return the hunks of parse_line_delta_iter() as a list."""
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        """Serialize a fulltext: just the text lines, origins dropped."""
        return content.text()

    def lower_line_delta(self, delta):
        """Serialize a delta: a header per hunk followed by its text lines."""
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out

260

261

262

def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location.

    :param transport: Transport the knit files are written through.
    :param relpath: Path of the knit, relative to transport, without the
        data/index suffixes.
    """
    # KnitVersionedFile takes (relpath, transport, ...) and expects a
    # factory instance; the remaining options are passed by keyword so they
    # cannot land in the wrong positional slot. create=True is required
    # because the constructor otherwise only opens an existing knit.
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    # NOTE(review): copy_to() closes the object _open_file() returns; here
    # the result is discarded as in the original - confirm whether it
    # should be closed.
    k._data._open_file()

266

267

268

class KnitVersionedFile(VersionedFile):

269

"""Weave-like structure with faster random access.

270

271

A knit stores a number of texts and a summary of the relationships

272

between them. Texts are identified by a string version-id. Texts

273

are normally stored and retrieved as a series of lines, but can

274

also be passed as single strings.

275

276

Lines are stored with the trailing newline (if any) included, to

277

avoid special cases for files with no final newline. Lines are

278

composed of 8-bit characters, not unicode. The combination of

279

these approaches should mean any 'binary' file can be safely

280

stored and retrieved.

281

"""

282

283

def __init__(self, relpath, transport, file_mode=None, access_mode=None,
             factory=None, basis_knit=DEPRECATED_PARAMETER, delta=True,
             create=False, create_parent_dir=False, delay_create=False,
             dir_mode=None):
    """Construct a knit at location specified by relpath.

    :param relpath: Path of the knit relative to transport; INDEX_SUFFIX
        and DATA_SUFFIX are appended to name the two underlying files.
    :param transport: Transport used to access the knit files.
    :param file_mode: Forwarded to the index and data objects.
    :param access_mode: 'r' or 'w'; defaults to 'w' when None.
    :param factory: Content factory; defaults to a KnitAnnotateFactory
        instance.
    :param basis_knit: Deprecated; passing it only emits a
        DeprecationWarning.
    :param delta: Stored as self.delta.
    :param create: If not True, only open an existing knit.
    :param create_parent_dir: If True, create the parent directory if
        creating the file fails.  (This is used for stores with
        hash-prefixes that may not exist yet)
    :param delay_create: The calling code is aware that the knit won't
        actually be created until the first data is stored.
    :param dir_mode: Forwarded to the index and data objects.
    """
    if deprecated_passed(basis_knit):
        # The message names the real entry point (__init__).
        warnings.warn("KnitVersionedFile.__init__(): The basis_knit"
                      " parameter is deprecated as of bzr 0.9.",
                      DeprecationWarning, stacklevel=2)
    if access_mode is None:
        access_mode = 'w'
    super(KnitVersionedFile, self).__init__(access_mode)
    assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
    self.transport = transport
    self.filename = relpath
    self.factory = factory or KnitAnnotateFactory()
    self.writable = (access_mode == 'w')
    self.delta = delta

    # Upper bound on how many parents _check_should_delta() walks back
    # while looking for a fulltext.
    self._max_delta_chain = 200

    # The index must exist before the data object is built: len(self)
    # below is evaluated while constructing the _KnitData arguments.
    self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
        access_mode, create=create, file_mode=file_mode,
        create_parent_dir=create_parent_dir, delay_create=delay_create,
        dir_mode=dir_mode)
    # Only ask for the data file to be created when len(self) is zero
    # (presumably: the index holds no versions yet - TODO confirm).
    self._data = _KnitData(transport, relpath + DATA_SUFFIX,
        access_mode, create=create and not len(self), file_mode=file_mode,
        create_parent_dir=create_parent_dir, delay_create=delay_create,
        dir_mode=dir_mode)

320

321

def __repr__(self):
    """Return 'ClassName(location)' using the transport's absolute path."""
    class_name = self.__class__.__name__
    location = self.transport.abspath(self.filename)
    return '%s(%s)' % (class_name, location)

324

325

def _check_should_delta(self, first_parents):
    """Decide whether a new version should be stored as a delta.

    Walks back along the first-parent chain, for at most
    _max_delta_chain steps, adding up the stored size of every delta
    until a fulltext is reached.

    :param first_parents: Parent list of the version about to be added;
        only the first entry of each parent list is followed.
    :return: True if a delta should be created (a fulltext was found and
        its size exceeds the summed delta sizes), False if a full text
        should be stored instead.
    """
    accumulated_delta_size = 0
    chain_parents = first_parents
    for _step in xrange(self._max_delta_chain):
        ancestor = chain_parents[0]
        method = self._index.get_method(ancestor)
        pos, size = self._index.get_position(ancestor)
        if method == 'fulltext':
            # Found the base of the chain: a delta is only worthwhile
            # while the deltas so far are smaller than that fulltext.
            return size > accumulated_delta_size
        accumulated_delta_size += size
        chain_parents = self._index.get_parents(ancestor)
    # No fulltext within the allowed chain length: store a new one.
    return False

353

354

def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
    """See VersionedFile._add_delta().

    Stores version_id as a line-delta record against delta_parent when
    _check_should_delta() allows it; otherwise the call is handed to the
    base class implementation.

    :param version_id: Id of the version being added.
    :param parents: Parent version ids, recorded in the index entry.
    :param delta_parent: Version the delta applies to, or None.
    :param sha1: Digest stored with the data record.
    :param noeol: If true, the 'no-eol' option is recorded.
    :param delta: The line delta, in the form accepted by the factory's
        lower_line_delta().
    """
    self._check_add(version_id, [])  # should we check the lines ?
    self._check_versions_present(parents)
    # The original also sorted parents into present/ghost lists (and
    # created an empty parent_texts dict) that were never read; that dead
    # bookkeeping has been dropped.

    if delta_parent is None:
        # reconstitute as full text.
        assert len(delta) in (0, 1)
        if len(delta):
            assert delta[0][0] == 0
            assert delta[0][1] == 0, delta[0][1]
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    # determine the current delta chain length.
    # To speed the extract of texts the delta chain is limited
    # to a fixed number of deltas.  This should minimize both
    # I/O and the time spent applying deltas.
    # The window was changed to a maximum of 200 deltas, but also added
    # was a check that the total compressed size of the deltas is
    # smaller than the compressed size of the fulltext.
    if not self._check_should_delta([delta_parent]):
        # We don't want a delta here, just do a normal insertion.
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    options = []
    if noeol:
        options.append('no-eol')
    options.append('line-delta')
    store_lines = self.factory.lower_line_delta(delta)

    # Write the data record first; the index entry needs its position
    # and size.
    where, size = self._data.add_record(version_id, sha1, store_lines)
    self._index.add_version(version_id, options, where, size, parents)

408

409

def _add_raw_records(self, records, data):

410

"""Add all the records 'records' with data pre-joined in 'data'.

411

412

:param records: A list of tuples(version_id, options, parents, size).

413

:param data: The data for the records. When it is written, the records

414

are adjusted to have pos pointing into data by the sum of

415

the preceding records sizes.

416

"""

417

# write all the data

418

pos = self._data.add_raw_record(data)

419

offset = 0

420

index_entries = []

421

for (version_id, options, parents, size) in records:

422

index_entries.append((version_id, options, pos+offset,

423

size, parents))

424

if self._data._do_cache:

425

self._data._cache[version_id] = data[offset:offset+size]

426

offset += size

427

self._index.add_versions(index_entries)

428

429

def enable_cache(self):
    """Turn on caching in the data file backing this knit."""
    data_file = self._data
    data_file.enable_cache()

432

433

def clear_cache(self):
    """Drop the cached record data; the index is left untouched."""
    data_file = self._data
    data_file.clear_cache()

436

437

def copy_to(self, name, transport):
    """See VersionedFile.copy_to()."""
    final_index = name + INDEX_SUFFIX
    temp_index = name + INDEX_SUFFIX + '.tmp'
    # Snapshot the index under a temporary name first so that local
    # writes happening meanwhile cannot race with the copy.
    transport.put_file_non_atomic(temp_index,
                                  self.transport.get(self._index._filename))
    # Then copy the data file.
    data_stream = self._data._open_file()
    try:
        transport.put_file(name + DATA_SUFFIX, data_stream)
    finally:
        data_stream.close()
    # Finally move the copied index into place.
    transport.move(temp_index, final_index)

451

452

def create_empty(self, name, transport, mode=None):
    """Return a newly created knit using this knit's factory and delta setting.

    NOTE(review): ``mode`` is accepted but not passed on to the new knit.
    """
    new_knit = KnitVersionedFile(name, transport,
                                 factory=self.factory,
                                 delta=self.delta,
                                 create=True)
    return new_knit

455

456

def _fix_parents(self, version, new_parents):

457

"""Fix the parents list for version.

458

459

This is done by appending a new version to the index

460

with identical data except for the parents list.

461

the parents list must be a superset of the current

462

list.

463

"""

464

current_values = self._index._cache[version]

465

assert set(current_values[4]).difference(set(new_parents)) == set()

466

self._index.add_version(version,

467

current_values[1],

468

current_values[2],

469

current_values[3],

470

new_parents)

471

472

def get_delta(self, version_id):
    """Get a delta for constructing version from some other version.

    :return: a tuple (parent, sha1, noeol, delta) where parent is the
        first parent of version_id (None when it has no parents).
    :raises RevisionNotPresent: if version_id is not in this knit.
    """
    if not self.has_version(version_id):
        raise RevisionNotPresent(version_id, self.filename)

    parent = None
    parent_ids = self.get_parents(version_id)
    if len(parent_ids):
        parent = parent_ids[0]

    index = self._index
    data_pos, data_size = index.get_position(version_id)
    wanted = ((version_id, data_pos, data_size),)
    data, sha1 = self._data.read_records(wanted)[version_id]
    version_idx = index.lookup(version_id)
    noeol = 'no-eol' in index.get_options(version_id)

    if index.get_method(version_id) != 'fulltext':
        # stored as a line delta already: hand back the parsed record.
        delta = self.factory.parse_line_delta(data, version_idx)
        return parent, sha1, noeol, delta

    # stored as a fulltext: diff it against the first parent (or against
    # an empty text when there is no parent).
    new_content = self.factory.parse_fulltext(data, version_idx)
    if parent is None:
        old_texts = []
    else:
        old_texts = self._get_content(parent).text()
    new_texts = new_content.text()
    delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
    return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

499

500

def get_graph_with_ghosts(self):
    """See VersionedFile.get_graph_with_ghosts()."""
    graph = {}
    for version_id, parents in self._index.get_graph():
        graph[version_id] = parents
    return graph

504

505

def get_sha1(self, version_id):
    """See VersionedFile.get_sha1()."""
    records = self._get_record_map([version_id])
    # each record is (method, content, digest, next); only digest matters
    _method, _content, sha1, _build_parent = records[version_id]
    return sha1

510

511

@staticmethod
def get_suffixes():
    """See VersionedFile.get_suffixes()."""
    suffixes = [DATA_SUFFIX]
    suffixes.append(INDEX_SUFFIX)
    return suffixes

515

516

def has_ghost(self, version_id):
    """True if there is a ghost reference in the file to version_id."""
    # a version we actually hold is never a ghost
    if self.has_version(version_id):
        return False
    # optimisable if needed by memoising the _ghosts set.
    known = self._index._cache
    for _node, parent_ids in self._index.get_graph():
        for parent_id in parent_ids:
            if parent_id == version_id and parent_id not in known:
                return True
    return False

529

530

def versions(self):
    """See VersionedFile.versions."""
    index = self._index
    return index.get_versions()

533

534

def has_version(self, version_id):
    """See VersionedFile.has_version."""
    index = self._index
    return index.has_version(version_id)

# ``version_id in knit`` is answered by has_version.
__contains__ = has_version

539

540

def _merge_annotations(self, content, parents, parent_texts={},
                       delta=None, annotated=None):
    """Merge annotations for content.

    When ``annotated`` is true, every line of ``content`` that matches a
    line in a parent has its (origin, text) entry replaced by the parent's
    entry; parents are processed in order, so later parents win.

    When ``delta`` is true, a line delta of ``content`` against the first
    parent is returned; otherwise the method returns None.
    """
    first_parent_seq = None
    if annotated:
        for parent_id in parents:
            parent_content = self._get_content(parent_id, parent_texts)
            matcher = patiencediff.PatienceSequenceMatcher(
                None, parent_content.text(), content.text())
            if first_parent_seq is None:
                # keep the first parent's matcher so the delta below
                # does not have to diff again.
                first_parent_seq = matcher
            for parent_start, new_start, length in matcher.get_matching_blocks():
                if n_is_zero(length):
                    continue
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[new_start:new_start + length] = \
                    parent_content._lines[parent_start:parent_start + length]
    if delta:
        if not annotated:
            # nothing was diffed above, so diff against the first parent now
            reference_content = self._get_content(parents[0], parent_texts)
            new_texts = content.text()
            old_texts = reference_content.text()
            first_parent_seq = patiencediff.PatienceSequenceMatcher(
                None, old_texts, new_texts)
        return self._make_line_delta(first_parent_seq, content)


def n_is_zero(length):
    """True for the zero-length blocks that get_matching_blocks may emit."""
    return length == 0

570

571

def _make_line_delta(self, delta_seq, new_content):

572

"""Generate a line delta from delta_seq and new_content."""

573

diff_hunks = []

574

for op in delta_seq.get_opcodes():

575

if op[0] == 'equal':

576

continue

577

diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))

578

return diff_hunks

579

580

def _get_components_positions(self, version_ids):

581

"""Produce a map of position data for the components of versions.

582

583

This data is intended to be used for retrieving the knit records.

584

585

A dict of version_id to (method, data_pos, data_size, next) is

586

returned.

587

method is the way referenced data should be applied.

588

data_pos is the position of the data in the knit.

589

data_size is the size of the data in the knit.

590

next is the build-parent of the version, or None for fulltexts.

591

"""

592

component_data = {}

593

for version_id in version_ids:

594

cursor = version_id

595

596

while cursor is not None and cursor not in component_data:

597

method = self._index.get_method(cursor)

598

if method == 'fulltext':

599

next = None

600

else:

601

next = self.get_parents(cursor)[0]

602

data_pos, data_size = self._index.get_position(cursor)

603

component_data[cursor] = (method, data_pos, data_size, next)

604

cursor = next

605

return component_data

606

607

def _get_content(self, version_id, parent_texts={}):

608

"""Returns a content object that makes up the specified

609

version."""

610

if not self.has_version(version_id):

611

raise RevisionNotPresent(version_id, self.filename)

612

613

cached_version = parent_texts.get(version_id, None)

614

if cached_version is not None:

615

return cached_version

616

617

text_map, contents_map = self._get_content_maps([version_id])

618

return contents_map[version_id]

619

620

def _check_versions_present(self, version_ids):

621

"""Check that all specified versions are present."""

622

version_ids = set(version_ids)

623

for r in list(version_ids):

624

if self._index.has_version(r):

625

version_ids.remove(r)

626

if version_ids:

627

raise RevisionNotPresent(list(version_ids)[0], self.filename)

628

629

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):

630

"""See VersionedFile.add_lines_with_ghosts()."""

631

self._check_add(version_id, lines)

632

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

633

634

def _add_lines(self, version_id, parents, lines, parent_texts):

635

"""See VersionedFile.add_lines."""

636

self._check_add(version_id, lines)

637

self._check_versions_present(parents)

638

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

639

640

def _check_add(self, version_id, lines):
    """Check that version_id and lines are safe to add.

    :raises InvalidRevisionId: if version_id contains whitespace.
    :raises RevisionAlreadyPresent: if version_id is already in the knit.
    """
    assert self.writable, "knit is not opened for write"
    ### FIXME escape. RBC 20060228
    if contains_whitespace(version_id):
        raise InvalidRevisionId(version_id, self.filename)
    already_stored = self.has_version(version_id)
    if already_stored:
        raise RevisionAlreadyPresent(version_id, self.filename)
    self._check_lines_not_unicode(lines)
    self._check_lines_are_lines(lines)

650

651

def _add(self, version_id, lines, parents, delta, parent_texts):
    """Add a set of lines on top of version specified by parents.

    If delta is true, compress the text as a line-delta against
    the first parent.

    Any versions not present will be converted into ghosts.

    ``lines`` is modified in place when its last line lacks a trailing
    newline.  The return value is the object produced by
    ``self.factory.make`` for the new text.
    """
    # An old lsprof run (461 calls, 6546 ms total) attributed the time of
    # this method mainly to:
    #   _merge_annotations  1918 ms (61 calls)
    #   add_record          1364 ms
    #   line_delta           974 ms (61 calls)
    #   _get_content         963 ms (61 calls)
    #   lower_fulltext       889 ms (400 calls)
    #   add_version          193 ms
    #   sha_strings          134 ms
    if parent_texts is None:
        parent_texts = {}
    # parents missing from this knit are ghosts and take no part in
    # delta or annotation work.
    present_parents = [p for p in parents if self.has_version(p)]
    have_parents = len(present_parents) > 0

    if delta and not have_parents:
        delta = False

    digest = sha_strings(lines)
    options = []
    if lines and lines[-1][-1] != '\n':
        # the stored text always ends in a newline; the flag records
        # that the real text does not.
        options.append('no-eol')
        lines[-1] = lines[-1] + '\n'

    if have_parents and delta:
        # To speed the extract of texts the delta chain is limited
        # to a fixed number of deltas.  This should minimize both
        # I/O and the time spent applying deltas.
        delta = self._check_should_delta(present_parents)

    lines = self.factory.make(lines, version_id)
    if delta or (self.factory.annotated and have_parents):
        # Merge annotations from parent texts if so is needed.
        delta_hunks = self._merge_annotations(lines, present_parents,
                                              parent_texts, delta,
                                              self.factory.annotated)

    if not delta:
        options.append('fulltext')
        store_lines = self.factory.lower_fulltext(lines)
    else:
        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta_hunks)

    where, size = self._data.add_record(version_id, digest, store_lines)
    self._index.add_version(version_id, options, where, size, parents)
    return lines

713

714

def check(self, progress_bar=None):
    """See VersionedFile.check().

    This implementation performs no checks and ignores progress_bar.
    """
    return None

716

717

def _clone_text(self, new_version_id, old_version_id, parents):

718

"""See VersionedFile.clone_text()."""

719

# FIXME RBC 20060228 make fast by only inserting an index with null

720

# delta.

721

self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

722

723

def get_lines(self, version_id):
    """See VersionedFile.get_lines()."""
    line_lists = self.get_line_list([version_id])
    return line_lists[0]

726

727

def _get_record_map(self, version_ids):

728

"""Produce a dictionary of knit records.

729

730

The keys are version_ids, the values are tuples of (method, content,

731

digest, next).

732

method is the way the content should be applied.

733

content is a KnitContent object.

734

digest is the SHA1 digest of this version id after all steps are done

735

next is the build-parent of the version, i.e. the leftmost ancestor.

736

If the method is fulltext, next will be None.

737

"""

738

position_map = self._get_components_positions(version_ids)

739

# c = component_id, m = method, p = position, s = size, n = next

740

records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]

741

record_map = {}

742

for component_id, content, digest in \

743

self._data.read_records_iter(records):

744

method, position, size, next = position_map[component_id]

745

record_map[component_id] = method, content, digest, next

746

747

return record_map

748

749

def get_text(self, version_id):
    """See VersionedFile.get_text"""
    texts = self.get_texts([version_id])
    return texts[0]

752

753

def get_texts(self, version_ids):
    """Return the text of each listed version as a single string."""
    texts = []
    for lines in self.get_line_list(version_ids):
        texts.append(''.join(lines))
    return texts

755

756

def get_line_list(self, version_ids):
    """Return the texts of listed versions as a list of strings."""
    text_map, _content_map = self._get_content_maps(version_ids)
    line_lists = []
    for version_id in version_ids:
        line_lists.append(text_map[version_id])
    return line_lists

760

761

def _get_content_maps(self, version_ids):
    """Produce maps of text and KnitContents

    :return: (text_map, content_map) where text_map contains the texts for
    the requested versions and content_map contains the KnitContents.
    Both dicts take version_ids as their keys.
    :raises RevisionNotPresent: if a requested version is not in the knit.
    :raises KnitCorrupt: if a rebuilt text does not match its stored sha-1.
    """
    for version_id in version_ids:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    record_map = self._get_record_map(version_ids)

    text_map = {}
    final_content = {}
    # contents already built for any component, shared between versions;
    # these always keep their trailing newline.
    built = {}
    for version_id in version_ids:
        # Walk from the version back along its build parents, stopping at
        # a fulltext (next is None) or at a component that is already built.
        chain = []
        cursor = version_id
        while cursor is not None:
            method, data, digest, build_parent = record_map[cursor]
            chain.append((cursor, method, data, digest))
            if cursor in built:
                break
            cursor = build_parent

        # Replay the chain oldest-first.
        content = None
        for component_id, method, data, digest in reversed(chain):
            if component_id in built:
                content = built[component_id]
                continue
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data[:], version_idx)
                # copy first: the parent's content stays in `built`
                content = content.copy()
                content._lines = self._apply_delta(content._lines, delta)
            built[component_id] = content

        if 'no-eol' in self._index.get_options(version_id):
            # strip the newline on a copy so `built` is not disturbed
            content = content.copy()
            origin, last_text = content._lines[-1]
            content._lines[-1] = (origin, last_text.rstrip('\n'))
        final_content[version_id] = content

        # digest here is the digest from the last applied component.
        text = content.text()
        if sha_strings(text) != digest:
            raise KnitCorrupt(self.filename,
                              'sha-1 does not match %s' % version_id)

        text_map[version_id] = text
    return text_map, final_content

817

818

def iter_lines_added_or_present_in_versions(self, version_ids=None,
                                            pb=None):
    """See VersionedFile.iter_lines_added_or_present_in_versions().

    Yields every line stored in the records of the requested versions:
    all lines of a fulltext record, and the lines carried by the hunks of
    a line-delta record.  Records are read in index order, not in the
    order of ``version_ids``.

    :param version_ids: versions to visit; None means all versions.
    :param pb: progress bar to update; a DummyProgress is used when None.
    :raises RevisionNotPresent: if a requested version is not in the knit.
    """
    if version_ids is None:
        version_ids = self.versions()
    if pb is None:
        pb = progress.DummyProgress()
    # we don't care about inclusions, the caller cares.
    # but we need to setup a list of records to visit.
    # we need version_id, position, length
    requested_versions = list(version_ids)
    # filter for available versions
    for version_id in requested_versions:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    # A set keeps the membership test below O(1) per version instead of
    # scanning the request list for every version in the knit.
    wanted = set(requested_versions)
    # get a in-component-order queue:
    version_id_records = []
    for version_id in self.versions():
        if version_id in wanted:
            data_pos, length = self._index.get_position(version_id)
            version_id_records.append((version_id, data_pos, length))

    total = len(version_id_records)
    records = self._data.read_records_iter(version_id_records)
    for done, (version_id, data, sha_value) in enumerate(records):
        pb.update('Walking content.', done, total)
        method = self._index.get_method(version_id)
        version_idx = self._index.lookup(version_id)
        assert method in ('fulltext', 'line-delta')
        if method == 'fulltext':
            content = self.factory.parse_fulltext(data, version_idx)
            for line in content.text():
                yield line
        else:
            delta = self.factory.parse_line_delta(data, version_idx)
            for start, end, count, lines in delta:
                for origin, line in lines:
                    yield line

    pb.update('Walking content.', total, total)

859

860

def num_versions(self):
    """See VersionedFile.num_versions()."""
    index = self._index
    return index.num_versions()

# len(knit) reports the number of versions.
__len__ = num_versions

865

866

def annotate_iter(self, version_id):
    """See VersionedFile.annotate_iter."""
    annotated = self._get_content(version_id).annotate_iter()
    for origin, text in annotated:
        yield origin, text

871

872

def get_parents(self, version_id):
    """See VersionedFile.get_parents.

    :raises RevisionNotPresent: if the index lookup fails with KeyError.
    """
    # perf notes:
    # optimism counts!
    # 52554 calls in 1264 872 internal down from 3674
    # i.e. ask the index directly and translate the failure, rather than
    # checking for presence first.
    index = self._index
    try:
        return index.get_parents(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)

881

882

def get_parents_with_ghosts(self, version_id):
    """See VersionedFile.get_parents_with_ghosts.

    :raises RevisionNotPresent: if the index lookup fails with KeyError.
    """
    index = self._index
    try:
        return index.get_parents_with_ghosts(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)

888

889

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    ``versions`` may be a single version id string or a sequence of ids.
    """
    if isinstance(versions, basestring):
        # a lone id is treated as a one-element list
        versions = [versions]
    if versions:
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)
    return []

897

898

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    ``versions`` may be a single version id string or a sequence of ids.
    """
    if isinstance(versions, basestring):
        # a lone id is treated as a one-element list
        versions = [versions]
    if versions:
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)
    return []

906

907

#@deprecated_method(zero_eight)

908

def walk(self, version_ids):
    """See VersionedFile.walk.

    Yields (lineno, insert_id, dset, line) tuples as produced by
    Weave.walk for a temporary weave built from the ancestry of
    version_ids.
    """
    # We take the short path here, and extract all relevant texts
    # and put them in a weave and let that do all the work.  Far
    # from optimal, but is much simpler.
    # FIXME RB 20060228 this really is inefficient!
    from bzrlib.weave import Weave

    w = Weave(self.filename)
    # Only membership is needed below, so hold the ancestry as a set;
    # testing against the list made the filter quadratic.
    ancestry = set(self.get_ancestry(version_ids))
    sorted_graph = topo_sort(self._index.get_graph())
    version_list = [vid for vid in sorted_graph if vid in ancestry]

    # topo_sort order guarantees parents are added before children.
    for version_id in version_list:
        lines = self.get_lines(version_id)
        w.add_lines(version_id, self.get_parents(version_id), lines)

    for lineno, insert_id, dset, line in w.walk(version_ids):
        yield lineno, insert_id, dset, line

927

928

def plan_merge(self, ver_a, ver_b):
    """See VersionedFile.plan_merge.

    Yields (state, line) pairs where state is one of 'unchanged',
    'new-a', 'killed-b', 'new-b' or 'killed-a'.
    """
    ancestors_b = set(self.get_ancestry(ver_b))
    ancestors_a = set(self.get_ancestry(ver_a))

    def classify(revision, seen_by_other, killed, new):
        # a line the other side already has in its ancestry but no longer
        # contains was killed there; otherwise it is new on this side.
        if revision in seen_by_other:
            return killed
        return new

    annotated_a = self.annotate(ver_a)
    annotated_b = self.annotate(ver_b)
    plain_a = [text for (_origin, text) in annotated_a]
    plain_b = [text for (_origin, text) in annotated_b]
    matcher = KnitSequenceMatcher(None, plain_a, plain_b)
    a_pos = 0
    b_pos = 0
    for a_start, b_start, length in matcher.get_matching_blocks():
        # process all mismatched sections
        # (last mismatched section is handled because blocks always
        # includes a 0-length last block)
        for revision, text in annotated_a[a_pos:a_start]:
            yield classify(revision, ancestors_b, 'killed-b', 'new-a'), text
        for revision, text in annotated_b[b_pos:b_start]:
            yield classify(revision, ancestors_a, 'killed-a', 'new-b'), text

        # and now the matched section
        a_pos = a_start + length
        b_pos = b_start + length
        for text_a, text_b in zip(plain_a[a_start:a_pos],
                                  plain_b[b_start:b_pos]):
            assert text_a == text_b
            yield "unchanged", text_a

966

967

968

class _KnitComponentFile(object):

969

"""One of the files used to implement a knit database"""

970

971

def __init__(self, transport, filename, mode, file_mode=None,

972

create_parent_dir=False, dir_mode=None):

973

self._transport = transport

974

self._filename = filename

975

self._mode = mode

976

self._file_mode = file_mode

977

self._dir_mode = dir_mode

978

self._create_parent_dir = create_parent_dir

979

self._need_to_create = False

980

981

def check_header(self, fp):

982

line = fp.readline()

983

if line != self.HEADER:

984

raise KnitHeaderError(badline=line)

985

986

def commit(self):

987

"""Commit is a nop."""

988

989

def __repr__(self):

990

return '%s(%s)' % (self.__class__.__name__, self._filename)

991

992

993

class _KnitIndex(_KnitComponentFile):

994

"""Manages knit index file.

995

996

The index is already kept in memory and read on startup, to enable

997

fast lookups of revision information. The cursor of the index

998

file is always pointing to the end, making it easy to append

999

entries.

1000

1001

_cache is a cache for fast mapping from version id to a Index

1002

object.

1003

1004

_history is a cache for fast mapping from indexes to version ids.

1005

1006

The index data format is dictionary compressed when it comes to

1007

parent references; an index entry may only have parents with a

1008

lower index number. As a result, the index is topologically sorted.

1009

1010

Duplicate entries may be written to the index for a single version id

1011

if this is done then the latter one completely replaces the former:

1012

this allows updates to correct version and parent information.

1013

Note that the two entries may share the delta, and that successive

1014

annotations and references MUST point to the first entry.

1015

1016

The index file on disc contains a header, followed by one line per knit

1017

record. The same revision can be present in an index file more than once.

1018

The first occurrence gets assigned a sequence number starting from 0.

1019

1020

The format of a single line is

1021

REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n

1022

REVISION_ID is a utf8-encoded revision id

1023

FLAGS is a comma separated list of flags about the record. Values include

1024

no-eol, line-delta, fulltext.

1025

BYTE_OFFSET is the ascii representation of the byte offset in the data file

1026

that the compressed data starts at.

1027

LENGTH is the ascii representation of the length of the data file.

1028

PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of

1029

REVISION_ID.

1030

PARENT_SEQUENCE_ID the ascii representation of the sequence number of a

1031

revision id already in the knit that is a parent of REVISION_ID.

1032

The ' :' marker is the end of record marker.

1033

1034

partial writes:

1035

when a write is interrupted to the index file, it will result in a line that

1036

does not end in ' :'. If the ' :' is not present at the end of a line, or at

1037

the end of the file, then the record that is missing it will be ignored by

1038

the parser.

1039

1040

When writing new records to the index file, the data is preceded by '\n'

1041

to ensure that records always start on new lines even if the last write was

1042

interrupted. As a result it is normal for the last line in the index to be

1043

missing a trailing newline. One can be added with no harmful effects.

1044

"""

1045

1046

HEADER = "# bzr knit index 8\n"

1047

1048

# speed of knit parsing went from 280 ms to 280 ms with slots addition.

1049

# __slots__ = ['_cache', '_history', '_transport', '_filename']

1050

1051

def _cache_version(self, version_id, options, pos, size, parents):

1052

"""Cache a version record in the history array and index cache.

1053

1054

This is inlined into __init__ for performance. KEEP IN SYNC.

1055

(It saves 60ms, 25% of the __init__ overhead on local 4000 record

1056

indexes).

1057

"""

1058

# only want the _history index to reference the 1st index entry

1059

# for version_id

1060

if version_id not in self._cache:

1061

index = len(self._history)

1062

self._history.append(version_id)

1063

else:

1064

index = self._cache[version_id][5]

1065

self._cache[version_id] = (version_id,

1066

options,

1067

pos,

1068

size,

1069

parents,

1070

index)

1071

1072

def __init__(self, transport, filename, mode, create=False, file_mode=None,
             create_parent_dir=False, delay_create=False, dir_mode=None):
    """Read the whole index file into memory.

    If the file does not exist, the error is re-raised unless mode is
    'w' and create is true.  In that case the file is either written
    immediately (containing just the header) or, when delay_create is
    true, flagged to be created by the first add_versions call.
    """
    _KnitComponentFile.__init__(self, transport, filename, mode,
                                file_mode=file_mode,
                                create_parent_dir=create_parent_dir,
                                dir_mode=dir_mode)
    self._cache = {}
    # position in _history is the 'official' index for a revision
    # but the values may have come from a newer entry.
    # so - wc -l of a knit index is != the number of unique names
    # in the knit.
    self._history = []
    cache = self._cache
    history = self._history
    pb = bzrlib.ui.ui_factory.nested_progress_bar()
    try:
        total = 1
        try:
            pb.update('read knit index', 0, total)
            fp = self._transport.get(self._filename)
            try:
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once.
                # probably what we want for nice behaviour on
                # http is a incremental readlines that yields, or
                # a check for local vs non local indexes,
                for raw_line in fp.readlines():
                    fields = raw_line.split()
                    if len(fields) < 5 or fields[-1] != ':':
                        # corrupt line.
                        # FIXME: in the future we should determine if its a
                        # short write - and ignore it
                        # or a different failure, and raise. RBC 20060407
                        continue
                    total += 1
                    #pb.update('read knit index', count, total)

                    # --- inlined self._parse_parents: KEEP IN SYNC ---
                    parents = []
                    for value in fields[4:-1]:
                        if value[0] == '.':
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # (in lsprof)
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(history[int(value)])
                    # --- end self._parse_parents ---

                    # --- inlined self._cache_version: KEEP IN SYNC ---
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = fields[0]
                    if version_id in cache:
                        sequence = cache[version_id][5]
                    else:
                        sequence = len(history)
                        history.append(version_id)
                    cache[version_id] = (version_id,
                                         fields[1].split(','),
                                         int(fields[2]),
                                         int(fields[3]),
                                         parents,
                                         sequence)
                    # --- end self._cache_version ---
            finally:
                fp.close()
        except NoSuchFile:
            if mode != 'w' or not create:
                raise
            if delay_create:
                self._need_to_create = True
            else:
                self._transport.put_bytes_non_atomic(self._filename,
                    self.HEADER, mode=self._file_mode)
    finally:
        pb.update('read knit index', total, total)
        pb.finished()

1161

1162

def _parse_parents(self, compressed_parents):

1163

"""convert a list of string parent values into version ids.

1164

1165

ints are looked up in the index.

1166

.FOO values are ghosts and converted in to FOO.

1167

1168

NOTE: the function is retained here for clarity, and for possible

1169

use in partial index reads. However bulk processing now has

1170

it inlined in __init__ for inner-loop optimisation.

1171

"""

1172

result = []

1173

for value in compressed_parents:

1174

if value[-1] == '.':

1175

# uncompressed reference

1176

result.append(value[1:])

1177

else:

1178

# this is 15/4000ms faster than isinstance,

1179

# this function is called thousands of times a

1180

# second so small variations add up.

1181

assert value.__class__ is str

1182

result.append(self._history[int(value)])

1183

return result

1184

1185

def get_graph(self):

1186

graph = []

1187

for version_id, index in self._cache.iteritems():

1188

graph.append((version_id, index[4]))

1189

return graph

1190

1191

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry."""
    # get a graph of all the mentioned versions:
    cache = self._cache
    graph = {}
    todo = set(versions)
    while len(todo):
        version = todo.pop()
        # got the parents ok
        # trim ghosts
        present = [p for p in cache[version][4] if p in cache]
        # queue every parent that has not been completed yet
        todo.update([p for p in present if p not in graph])
        graph[version] = present
    return topo_sort(graph.items())

1208

1209

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts."""
    # get a graph of all the mentioned versions:
    graph = {}
    todo = set(versions)
    while len(todo):
        version = todo.pop()
        entry = self._cache.get(version)
        if entry is None:
            # ghost, fake it
            graph[version] = []
            continue
        # got the parents ok
        parents = entry[4]
        for parent in parents:
            if parent not in graph:
                todo.add(parent)
        graph[version] = parents
    return topo_sort(graph.items())

1229

1230

def num_versions(self):

1231

return len(self._history)

1232

1233

__len__ = num_versions

1234

1235

def get_versions(self):

1236

return self._history

1237

1238

def idx_to_name(self, idx):

1239

return self._history[idx]

1240

1241

def lookup(self, version_id):

1242

assert version_id in self._cache

1243

return self._cache[version_id][5]

1244

1245

def _version_list_to_index(self, versions):
    """Render a list of version ids as the parents field of an index line.

    Versions present in the index are written as their sequence number;
    any other version is written as '.' followed by its utf8 encoding.
    """
    encode_utf8 = cache_utf8.encode
    cache = self._cache
    fields = []
    for version in versions:
        if version not in cache:
            fields.append('.' + encode_utf8(version))
        else:
            # -- inlined lookup() --
            fields.append(str(cache[version][5]))
            # -- end lookup () --
    return ' '.join(fields)

1256

1257

def add_version(self, version_id, options, pos, size, parents):

1258

"""Add a version record to the index."""

1259

self.add_versions(((version_id, options, pos, size, parents),))

1260

1261

def add_versions(self, versions):
    """Add multiple versions to the index.

    Each record is formatted as one index line and passed to
    self._cache_version; the lines are then either appended to the
    existing file or, when self._need_to_create is set, written out
    after self.HEADER as a new file.

    :param versions: a list of tuples:
                     (version_id, options, pos, size, parents).
    """
    encode_utf8 = cache_utf8.encode
    # snapshots so a failure part-way through can be undone
    saved_history = self._history[:]
    saved_cache = self._cache.copy()
    new_lines = []

    try:
        for version_id, options, pos, size, parents in versions:
            version_utf8 = encode_utf8(version_id)
            option_text = ','.join(options)
            # parents are rendered before this version is cached
            parent_text = self._version_list_to_index(parents)
            line = "\n%s %s %s %s %s :" % (version_utf8, option_text,
                                           pos, size, parent_text)
            assert isinstance(line, str), \
                'content must be utf-8 encoded: %r' % (line,)
            new_lines.append(line)
            self._cache_version(version_id, options, pos, size, parents)
        if self._need_to_create:
            sio = StringIO()
            sio.write(self.HEADER)
            sio.writelines(new_lines)
            sio.seek(0)
            self._transport.put_file_non_atomic(self._filename, sio,
                                create_parent_dir=self._create_parent_dir,
                                mode=self._file_mode,
                                dir_mode=self._dir_mode)
            self._need_to_create = False
        else:
            self._transport.append_bytes(self._filename, ''.join(new_lines))
    except:
        # Deliberately bare: whatever went wrong, put the in-memory state
        # back the way it was and let the error propagate.
        self._history = saved_history
        self._cache = saved_cache
        raise

1300

1301

def has_version(self, version_id):
    """True if the version is in the index."""
    known = self._cache
    return version_id in known

1304

1305

def get_position(self, version_id):
    """Return data position and size of specified version.

    These are elements 2 and 3 of the version's cache entry.
    """
    entry = self._cache[version_id]
    position, size = entry[2], entry[3]
    return (position, size)

1309

1310

def get_method(self, version_id):
    """Return compression method of specified version.

    The answer is 'fulltext' when that option is recorded for the
    version, otherwise 'line-delta' (which is asserted to be present).
    """
    options = self._cache[version_id][1]
    if 'fulltext' not in options:
        assert 'line-delta' in options
        return 'line-delta'
    return 'fulltext'

1318

1319

def get_options(self, version_id):
    """Return element 1 (the options) of the version's cache entry."""
    entry = self._cache[version_id]
    return entry[1]

1321

1322

def get_parents(self, version_id):
    """Return parents of specified version ignoring ghosts.

    A parent counts as a ghost here when it has no entry in self._cache.
    """
    cache = self._cache
    present = []
    for parent in cache[version_id][4]:
        if parent in cache:
            present.append(parent)
    return present

1326

1327

def get_parents_with_ghosts(self, version_id):
    """Return parents of specified version with ghosts.

    This is element 4 of the cache entry, returned unfiltered.
    """
    entry = self._cache[version_id]
    return entry[4]

1330

1331

def check_versions_present(self, version_ids):
    """Check that all specified versions are present.

    :param version_ids: iterable of version ids to look up in self._cache.
    :raises RevisionNotPresent: for the first id, in iteration order,
        that has no entry in self._cache.
    """
    cache = self._cache
    for version_id in version_ids:
        if version_id not in cache:
            # Report against self._filename, the attribute the other
            # methods of this class use for the file name.
            raise RevisionNotPresent(version_id, self._filename)

1339

1340

1341

class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    def __init__(self, transport, filename, mode, create=False, file_mode=None,
                 create_parent_dir=False, delay_create=False,
                 dir_mode=None):
        """Set up the data file wrapper with an empty, disabled record cache.

        When create is true the file is either written out empty straight
        away or, with delay_create, flagged through _need_to_create so
        that the first write creates it.
        """
        _KnitComponentFile.__init__(self, transport, filename, mode,
                                    file_mode=file_mode,
                                    create_parent_dir=create_parent_dir,
                                    dir_mode=dir_mode)
        self._checked = False
        # TODO: jam 20060713 conceptually, this could spill to disk
        #       if the cached size gets larger than a certain amount
        #       but it complicates the model a bit, so for now just use
        #       a simple dictionary
        self._cache = {}
        self._do_cache = False
        if not create:
            return
        if delay_create:
            self._need_to_create = create
            return
        self._transport.put_bytes_non_atomic(self._filename, '',
                                             mode=self._file_mode)

    def enable_cache(self):
        """Enable caching of reads."""
        self._do_cache = True

    def clear_cache(self):
        """Clear the record cache and switch caching off again."""
        self._cache = {}
        self._do_cache = False

    def _open_file(self):
        """Return the transport's file object, or None if there is no file."""
        try:
            return self._transport.get(self._filename)
        except NoSuchFile:
            return None

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        The block is a gzip stream holding a "version ..." header line,
        the content lines, and an "end ..." trailer line.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        buf = StringIO()
        gz = GzipFile(None, mode='wb', fileobj=buf)

        utf8_id = cache_utf8.encode(version_id)
        header = "version %s %d %s\n" % (utf8_id, len(lines), digest)
        trailer = "end %s\n" % utf8_id
        gz.writelines(chain([header], lines, [trailer]))
        gz.close()
        # the buffer position once the stream is closed is the length
        raw_length = buf.tell()

        buf.seek(0)
        return raw_length, buf

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file.

        :return: the offset in the data file raw_data was written.
        """
        assert isinstance(raw_data, str), 'data must be plain bytes'
        if self._need_to_create:
            # first write: the record becomes the whole file, at offset 0
            self._transport.put_bytes_non_atomic(self._filename, raw_data,
                                   create_parent_dir=self._create_parent_dir,
                                   mode=self._file_mode,
                                   dir_mode=self._dir_mode)
            self._need_to_create = False
            return 0
        return self._transport.append_bytes(self._filename, raw_data)

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.

        :return: (position in the file where it was written, size).
        """
        size, sio = self._record_to_data(version_id, digest, lines)
        # write to disk
        if self._need_to_create:
            self._transport.put_file_non_atomic(self._filename, sio,
                               create_parent_dir=self._create_parent_dir,
                               mode=self._file_mode,
                               dir_mode=self._dir_mode)
            self._need_to_create = False
            start_pos = 0
        else:
            start_pos = self._transport.append_file(self._filename, sio)
        if self._do_cache:
            self._cache[version_id] = sio.getvalue()
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the decompressor stream and the split header line,
                 as (stream, header_record)
        :raises KnitCorrupt: if the header does not have four fields or
            names a different version.
        """
        stream = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        header = stream.readline().split()
        if len(header) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if cache_utf8.decode(header[1]) != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                version_id, header[1]))
        return stream, header

    def _parse_record(self, version_id, data):
        """Decompress one raw record and check its trailer line.

        :return: (content lines, field 3 of the header line)
        """
        # profiling notes:
        # 4168 calls in 2880 217 internal
        # 4168 calls to _parse_record_header in 2121
        # 4168 calls to readlines in 330
        stream, header = self._parse_record_header(version_id, data)
        body = stream.readlines()
        # the final line is the "end <version>" trailer, not content
        last_line = body.pop()
        assert len(body) == int(header[2])
        expected_end = 'end %s\n' % cache_utf8.encode(version_id)
        if last_line != expected_end:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (last_line, version_id))
        stream.close()
        return body, header[3]

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.
        """
        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(records):
            # grab the disk data needed.
            if not self._cache:
                # Nothing cached, so there is nothing to filter out
                wanted = [(pos, size) for version_id, pos, size in records]
            else:
                wanted = [(pos, size) for version_id, pos, size in records
                          if version_id not in self._cache]

            raw_records = self._transport.readv(self._filename, wanted)

        for version_id, pos, size in records:
            if version_id not in self._cache:
                pos, data = raw_records.next()
                if self._do_cache:
                    self._cache[version_id] = data

                # validate the header
                stream, header = self._parse_record_header(version_id, data)
                stream.close()
            else:
                # This data has already been validated
                data = self._cache[version_id]
            yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        The result will be returned in whatever is the fastest to read.
        Not by the order requested. Also, multiple requests for the same
        record will only yield 1 response.
        :param records: A list of (version_id, pos, len) entries
        :return: Yields (version_id, contents, digest) in the order
                 read, not the order requested
        """
        if not records:
            return

        by_position = operator.itemgetter(1)
        if not self._cache:
            needed_records = sorted(set(records), key=by_position)
        else:
            # Serve records we have already seen straight from the cache
            yielded_records = set()
            needed_records = set()
            for record in records:
                record_id = record[0]
                if record_id not in self._cache:
                    needed_records.add(record)
                    continue
                if record_id in yielded_records:
                    continue
                yielded_records.add(record_id)
                data = self._cache[record_id]
                content, digest = self._parse_record(record_id, data)
                yield (record_id, content, digest)
            needed_records = sorted(needed_records, key=by_position)

        if not needed_records:
            return

        # The transport optimizes the fetching as well
        # (ie, reads continuous ranges.)
        offsets = [(pos, size) for version_id, pos, size in needed_records]
        readv_response = self._transport.readv(self._filename, offsets)

        for (version_id, pos, size), (pos, data) in \
                izip(iter(needed_records), readv_response):
            content, digest = self._parse_record(version_id, data)
            if self._do_cache:
                self._cache[version_id] = data
            yield version_id, content, digest

    def read_records(self, records):
        """Read records into a dictionary."""
        return dict((record_id, (content, digest))
                    for record_id, content, digest
                    in self.read_records_iter(records))

1556

1557

1558

class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits.  """
        try:
            if not isinstance(source, KnitVersionedFile):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Copies the raw records the target lacks from the source, then
        extends the parent lists of versions both sides have but
        disagree about.

        :return: the number of records copied.
        """
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        # note: a progress bar passed in by the caller is not used
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            source = self.source
            target = self.target
            self.source_ancestry = set(source.get_ancestry(version_ids))
            this_versions = set(target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(target.get_parents_with_ghosts(version))
                n2 = set(source.get_parents_with_ghosts(version))
                if n1 == n2:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the n2 revisions.
                for parent in n2:
                    if parent in n1:
                        # safe
                        continue
                    parent_ancestors = source.get_ancestry(parent)
                    if version in parent_ancestors:
                        raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                new_parents = n2.difference(n1)
                needed_versions.update(new_parents.difference(this_versions))
                mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(source.get_graph())

            version_list = [i for i in full_list
                            if (not target.has_version(i)
                                and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = source._index.get_options(version_id)
                parents = source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (target.has_version(parent) or
                            parent in copy_set or
                            not source.has_version(parent))
                data_pos, data_size = source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            raw_stream = source._data.read_records_iter_raw(copy_queue_records)
            for (version_id, raw_data), \
                    (version_id2, options, parents) in izip(raw_stream,
                                                            copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count += 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(target.get_parents_with_ghosts(version))
                n2 = set(source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                extra_parents = list(n2.difference(n1))
                new_parents = target.get_parents_with_ghosts(version) + extra_parents
                target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)

1673

1674

1675

class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with weaves to knits."""
        try:
            if not isinstance(source, bzrlib.weave.Weave):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Adds the texts the target lacks, one version at a time, then
        extends the parent lists of versions both sides have but where
        the source lists parents the target does not.

        :return: the number of versions added.
        """
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        # note: a progress bar passed in by the caller is not used
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            source = self.source
            target = self.target
            self.source_ancestry = set(source.get_ancestry(version_ids))
            this_versions = set(target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(target.get_parents_with_ghosts(version))
                n2 = set(source.get_parents(version))
                # if all of n2's parents are in n1, then it's fine.
                new_parents = n2.difference(n1)
                if not new_parents:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the n2 revisions.
                for parent in n2:
                    if parent in n1:
                        # safe
                        continue
                    parent_ancestors = source.get_ancestry(parent)
                    if version in parent_ancestors:
                        raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                needed_versions.update(new_parents.difference(this_versions))
                mismatched_versions.add(version)

            if not needed_versions and not mismatched_versions:
                return 0
            full_list = topo_sort(source.get_graph())

            version_list = [i for i in full_list
                            if (not target.has_version(i)
                                and i in needed_versions)]

            # do the join:
            total = len(version_list)
            for done, version_id in enumerate(version_list):
                pb.update("Converting to knit", done, total)
                parents = source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (target.has_version(parent))
                target.add_lines(
                    version_id, parents, source.get_lines(version_id))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(target.get_parents_with_ghosts(version))
                n2 = set(source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                extra_parents = list(n2.difference(n1))
                new_parents = target.get_parents_with_ghosts(version) + extra_parents
                target.fix_parents(version, new_parents)
            # every entry of version_list was added by the loop above
            return total
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(WeaveToKnit)

1766

1767

1768

class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = KnitSequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = KnitSequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = KnitSequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            # (j2len is rebound every pass, so its .get must be re-read)
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            # 704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            # 704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            #  +211400 0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>

            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j < blo:
                        continue
                    if j >= bhi:
                        break
                    k = j2lenget(j - 1, 0) + 1
                    newj2len[j] = k
                    if k > bestsize:
                        besti, bestj, bestsize = i - k + 1, j - k + 1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while (besti > alo and bestj > blo and
               not isbjunk(b[bestj - 1]) and
               a[besti - 1] == b[bestj - 1]):
            besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
        while (besti + bestsize < ahi and bestj + bestsize < bhi and
               not isbjunk(b[bestj + bestsize]) and
               a[besti + bestsize] == b[bestj + bestsize]):
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while (besti > alo and bestj > blo and
               isbjunk(b[bestj - 1]) and
               a[besti - 1] == b[bestj - 1]):
            besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1
        while (besti + bestsize < ahi and bestj + bestsize < bhi and
               isbjunk(b[bestj + bestsize]) and
               a[besti + bestsize] == b[bestj + bestsize]):
            bestsize += 1

        return besti, bestj, bestsize

1900

Older »