/brz/remove-bazaar : revision 1852.7.5

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Robert Collins
Date: 2006-07-29 01:20:55 UTC
mfrom: (1852.6.10 tree-implementation tests.)
mto: (1852.8.9 InterTree)
mto: This revision was merged to the branch mainline in revision 1891.
Revision ID: robertc@robertcollins.net-20060729012055-720404af16df3bff

Merge bzr.dev.

files added:
.bzrignore

.rsyncexclude

BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

README

TODO

build-api

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/benchmarks

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bundle

bzrlib/bundle/__init__.py

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/bundle_data.py

bzrlib/bundle/commands.py

bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

bzrlib/bundle/serializer

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v08.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/ignores.py

bzrlib/info.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/patches.py

bzrlib/patiencediff.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/plugins/launchpad

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_registration.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/revisiontree.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/status.py

bzrlib/store

bzrlib/store/__init__.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_bundle.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_exceptions.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignore.py

bzrlib/tests/blackbox/test_ignored.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_ls.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_mv.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remerge.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_history.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/blackbox/test_whoami.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_break_lock.py

bzrlib/tests/branch_implementations/test_locking.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/lock_helpers.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_break_lock.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_delta.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_http_response.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_ignores.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_patches.py

bzrlib/tests/test_patches_data

bzrlib/tests/test_patches_data/diff

bzrlib/tests/test_patches_data/diff-2

bzrlib/tests/test_patches_data/diff-3

bzrlib/tests/test_patches_data/diff-4

bzrlib/tests/test_patches_data/diff-5

bzrlib/tests/test_patches_data/diff-6

bzrlib/tests/test_patches_data/insert_top.patch

bzrlib/tests/test_patches_data/mod

bzrlib/tests/test_patches_data/mod-2

bzrlib/tests/test_patches_data/mod-3

bzrlib/tests/test_patches_data/mod-4

bzrlib/tests/test_patches_data/mod-5

bzrlib/tests/test_patches_data/mod-6

bzrlib/tests/test_patches_data/orig

bzrlib/tests/test_patches_data/orig-2

bzrlib/tests/test_patches_data/orig-3

bzrlib/tests/test_patches_data/orig-4

bzrlib/tests/test_patches_data/orig-5

bzrlib/tests/test_patches_data/orig-6

bzrlib/tests/test_patches_data/patchtext.patch

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_status.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_urlutils.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/tree_implementations

bzrlib/tests/tree_implementations/__init__.py

bzrlib/tests/tree_implementations/test_test_trees.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/tests/workingtree_implementations/test_break_lock.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_executable.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_is_ignored.py

bzrlib/tests/workingtree_implementations/test_locking.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textfile.py

bzrlib/textinv.py

bzrlib/textmerge.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_pycurl_errors.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/http/response.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/urlutils.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/README.1st

doc/configuration.txt

doc/plugins.txt

doc/setting_up_email.txt

doc/specifying_revisions.txt

doc/tutorial.txt

doc/using_aliases.txt

generate_docs.py

profile_imports.py

setup.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/doc_generate/autodoc_rstx.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

files removed:
.bzrignore

COPYING

INSTALL

Makefile

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

commands.py

converter.py

dir.py

errors.py

fetch.py

foreign

foreign/.bzrignore

foreign/TODO

foreign/__init__.py

foreign/test_versionedfiles.py

foreign/upgrade.py

foreign/versionedfiles.py

mapping.py

notes

notes/roundtripping.txt

remote.py

repository.py

revspec.py

server.py

setup.py

shamap.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_dir.py

tests/test_fetch.py

tests/test_ids.py

tests/test_repository.py

versionedfiles.py

workingtree.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# Written by Martin Pool.

# Modified by Johan Rydberg <jrydberg@gnu.org>

# Modified by Robert Collins <robert.collins@canonical.com>

# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

updates.

Knit file layout:

lifeless: the data file is made up of "delta records". each delta record has a delta header

that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of

the -expanded data- (ie, the delta applied to the parent). the delta also ends with a

end-marker; simply "end VERSION"

delta can be line or full contents.

... the 8's there are the index number of the annotation.

version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e

59,59,3

8 if ie.executable:

8 e.set('executable', 'yes')

130,130,2

8 if elt.get('executable') == 'yes':

8 ie.executable = True

end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

whats in an index:

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents

09:33 < jrydberg> lifeless: the parents are currently dictionary compressed

09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)

09:33 < lifeless> right

09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used

in the deltas to provide line annotation

"""

# TODOS:

# 10:16 < lifeless> make partial index writes safe

# 10:16 < lifeless> implement 'knit.check()' like weave.check()

# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# always' approach.

# move sha1 out of the content so that join is faster at verifying parents

# record content length ?

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import operator

import os

import sys

import warnings

import bzrlib

import bzrlib.errors as errors

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

from bzrlib.tuned_gzip import GzipFile

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

from bzrlib.versionedfile import VersionedFile, InterVersionedFile

from bzrlib.symbol_versioning import DEPRECATED_PARAMETER, deprecated_passed

from bzrlib.tsort import topo_sort

import bzrlib.weave

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data

# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

100

# TODO: atomically append data, then measure backwards from the cursor

101

# position after writing to work out where it was located. we may need to

102

# bypass python file buffering.

103

104

# Suffixes appended to a knit's relative path to name its two files:
# the record data file and the index file.
DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'

106

107

108

class KnitContent(object):
    """One knit version held as a list of (origin, text) line pairs.

    Line deltas from this content to another content object can be derived
    from it.
    """

    def __init__(self, lines):
        # The sequence of (origin, text) pairs is stored as given, not copied.
        self._lines = lines

    def annotate_iter(self):
        """Generate an (origin, text) tuple for every content line."""
        for pair in self._lines:
            origin, text = pair
            yield origin, text

    def annotate(self):
        """Return every (origin, text) tuple as a list."""
        return [entry for entry in self.annotate_iter()]

    def line_delta_iter(self, new_lines):
        """Yield the hunks that turn this content into new_lines.

        new_lines is another content object; its ``_lines`` are read.
        Each hunk is (old_start, old_end, new_line_count, new_pairs) where
        new_pairs are the (origin, text) pairs taken from new_lines.
        Regions reported as 'equal' by the matcher produce no hunk.
        """
        target_texts = [text for origin, text in new_lines._lines]
        source_texts = [text for origin, text in self._lines]
        matcher = KnitSequenceMatcher(None, source_texts, target_texts)
        for tag, old_from, old_to, new_from, new_to in matcher.get_opcodes():
            if tag != 'equal':
                yield (old_from, old_to, new_to - new_from,
                       new_lines._lines[new_from:new_to])

    def line_delta(self, new_lines):
        """Return the hunks from line_delta_iter() as a list."""
        return [hunk for hunk in self.line_delta_iter(new_lines)]

    def text(self):
        """Return only the text of each line, dropping the origins."""
        return [line_text for _origin, line_text in self._lines]

    def copy(self):
        """Return a new KnitContent built on a slice copy of the lines."""
        duplicate = self._lines[:]
        return KnitContent(duplicate)

142

143

144

class _KnitFactory(object):
    """Common base for the factories that build content objects."""

    def make(self, lines, version):
        """Wrap plain text lines in a KnitContent, pairing each with version."""
        origins = [version] * len(lines)
        return KnitContent(zip(origins, lines))

150

151

152

class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\\n
        internal representation is of the format:
        (revid, plaintext)

        version is accepted for signature parity with the plain factory
        and is not used here: each origin is read from the line itself.
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version=None):
        """Yield the hunks produced by parse_line_delta(), one at a time.

        version is optional so existing one-argument callers keep working;
        it is forwarded to parse_line_delta(), which does not use it.
        """
        # The original subscripted the bound method instead of calling it,
        # which raised TypeError on first use.
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        result = []
        lines = iter(lines)
        # Bound method used to pull the payload lines that follow a header;
        # named so that it does not shadow the builtin next().
        next_line = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            for _unused in range(count):
                origin, text = next_line().split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out

218

219

220

class KnitPlainFactory(_KnitFactory):
    """Factory producing Content objects from unannotated records."""

    annotated = False

    def parse_fulltext(self, content, version):
        """Parse an unannotated fulltext.

        This is not a no-op: the internal representation still holds
        (versionid, line) pairs, all carrying the same versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        """Yield (start, end, count, pairs) hunks parsed from lines.

        Each header line is 'start,end,count' and is followed by count
        payload lines, which are paired with version. The list passed in
        is consumed in place as hunks are produced.
        """
        while lines:
            fields = lines.pop(0).split(',')
            start, end, c = [int(field) for field in fields]
            yield start, end, c, zip([version] * c, lines[:c])
            # Drop the payload only after the consumer resumes the generator.
            del lines[:c]

    def parse_line_delta(self, lines, version):
        """Return all hunks from parse_line_delta_iter() as a list."""
        return [hunk for hunk in self.parse_line_delta_iter(lines, version)]

    def lower_fulltext(self, content):
        """Serialize a fulltext: just the text lines of content."""
        return content.text()

    def lower_line_delta(self, delta):
        """Serialize a delta as header lines followed by their text lines."""
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for _origin, text in lines:
                out.append(text)
        return out

252

253

254

def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location.

    :param transport: transport the knit files are written through.
    :param relpath: path of the knit relative to transport, without suffix.
    """
    # KnitVersionedFile takes (relpath, transport, file_mode, access_mode,
    # factory, ...); the old positional call put 'w' in file_mode and the
    # factory class in access_mode, which fails the access-mode assertion.
    # The factory must be an instance, and create=True is needed because
    # the constructor otherwise only opens an existing knit.
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    # Open the data file as before, then release the handle instead of
    # leaking it (copy_to() closes the same kind of handle).
    data_file = k._data._open_file()
    data_file.close()

258

259

260

class KnitVersionedFile(VersionedFile):

261

"""Weave-like structure with faster random access.

262

263

A knit stores a number of texts and a summary of the relationships

264

between them. Texts are identified by a string version-id. Texts

265

are normally stored and retrieved as a series of lines, but can

266

also be passed as single strings.

267

268

Lines are stored with the trailing newline (if any) included, to

269

avoid special cases for files with no final newline. Lines are

270

composed of 8-bit characters, not unicode. The combination of

271

these approaches should mean any 'binary' file can be safely

272

stored and retrieved.

273

"""

274

275

def __init__(self, relpath, transport, file_mode=None, access_mode=None,
             factory=None, basis_knit=DEPRECATED_PARAMETER, delta=True,
             create=False):
    """Construct a knit at location specified by relpath.

    :param relpath: path of the knit without suffix; INDEX_SUFFIX and
        DATA_SUFFIX are appended to name the index and data files.
    :param transport: transport handed to the index and data objects.
    :param file_mode: passed through to the index and data objects.
    :param access_mode: 'r' or 'w'; None is treated as 'w'.
    :param factory: content factory instance; a KnitAnnotateFactory is
        created when none is given.
    :param basis_knit: deprecated; passing it only emits a
        DeprecationWarning.
    :param delta: stored on the instance as ``self.delta``.
    :param create: If not True, only open an existing knit.
    """
    if deprecated_passed(basis_knit):
        # The message previously named the method as "__()".
        warnings.warn("KnitVersionedFile.__init__(): The basis_knit"
                      " parameter is deprecated as of bzr 0.9.",
                      DeprecationWarning, stacklevel=2)
    if access_mode is None:
        access_mode = 'w'
    super(KnitVersionedFile, self).__init__(access_mode)
    assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
    self.transport = transport
    self.filename = relpath
    self.factory = factory or KnitAnnotateFactory()
    self.writable = (access_mode == 'w')
    self.delta = delta

    self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
        access_mode, create=create, file_mode=file_mode)
    # len(self) is evaluated after the index is set up; the data object is
    # only asked to create when the knit currently has length zero.
    self._data = _KnitData(transport, relpath + DATA_SUFFIX,
        access_mode, create=create and not len(self), file_mode=file_mode)

300

301

def __repr__(self):
    """Return 'ClassName(absolute location of the knit)'."""
    location = self.transport.abspath(self.filename)
    return '%s(%s)' % (self.__class__.__name__, location)

304

305

def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
    """See VersionedFile._add_delta().

    When delta_parent is None, or no fulltext is found within 25 steps
    back along the delta chain, the work is handed to the base class
    implementation. Otherwise the delta is stored directly as a
    'line-delta' record and indexed.
    """
    self._check_add(version_id, [])  # should we check the lines ?
    self._check_versions_present(parents)

    if delta_parent is None:
        # reconstitute as full text.
        assert len(delta) == 1 or len(delta) == 0
        if len(delta):
            assert delta[0][0] == 0
            assert delta[0][1] == 0, delta[0][1]
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    # determine the current delta chain length.
    # To speed the extract of texts the delta chain is limited
    # to a fixed number of deltas.  This should minimize both
    # I/O and the time spent applying deltas.
    count = 0
    delta_parents = [delta_parent]
    while count < 25:
        parent = delta_parents[0]
        method = self._index.get_method(parent)
        if method == 'fulltext':
            break
        delta_parents = self._index.get_parents(parent)
        count = count + 1
    if method == 'line-delta':
        # did not find a fulltext in the delta limit.
        # just do a normal insertion.
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    options = []
    if noeol:
        options.append('no-eol')
    options.append('line-delta')
    store_lines = self.factory.lower_line_delta(delta)

    # The supplied sha1 is recorded as the record's digest.
    where, size = self._data.add_record(version_id, sha1, store_lines)
    self._index.add_version(version_id, options, where, size, parents)

366

367

def _add_raw_records(self, records, data):
    """Add all the records 'records' with data pre-joined in 'data'.

    :param records: A list of tuples(version_id, options, parents, size).
    :param data: The data for the records. When it is written, the records
                 are adjusted to have pos pointing into data by the sum of
                 the preceding records sizes.
    """
    # Write everything in one go; add_raw_record reports where it landed.
    base = self._data.add_raw_record(data)
    consumed = 0
    entries = []
    for version_id, options, parents, size in records:
        entries.append((version_id, options, base + consumed, size, parents))
        if self._data._do_cache:
            # Keep this record's slice of the raw data in the data cache.
            self._data._cache[version_id] = data[consumed:consumed + size]
        consumed += size
    self._index.add_versions(entries)

386

387

def enable_cache(self):
    """Turn on caching in this knit's data object."""
    data = self._data
    data.enable_cache()

390

391

def clear_cache(self):
    """Clear the cache of this knit's data object; the index is untouched."""
    data = self._data
    data.clear_cache()

394

395

def copy_to(self, name, transport):
    """See VersionedFile.copy_to()."""
    index_target = name + INDEX_SUFFIX
    staging = index_target + '.tmp'
    # Copy the current index under a temporary name first, to avoid racing
    # with local writes.
    transport.put(staging, self.transport.get(self._index._filename))
    # Copy the data file, always closing the handle afterwards.
    data_file = self._data._open_file()
    try:
        transport.put(name + DATA_SUFFIX, data_file)
    finally:
        data_file.close()
    # Only now move the copied index into its final place.
    transport.move(staging, index_target)

408

409

def create_empty(self, name, transport, mode=None):
    """Create a new knit called name on transport.

    The new knit reuses this knit's factory and delta setting.
    NOTE(review): mode is accepted but not forwarded to the constructor;
    confirm whether it was meant to be passed as file_mode.
    """
    return KnitVersionedFile(name,
                             transport,
                             factory=self.factory,
                             delta=self.delta,
                             create=True)

411

412

def _fix_parents(self, version, new_parents):
    """Fix the parents list for version.

    A new entry for version is appended to the index, identical to the
    cached one except for the parents list. new_parents must be a
    superset of the current parents.
    """
    entry = self._index._cache[version]
    # Slot 4 of the cached entry holds the current parents.
    assert set(entry[4]) - set(new_parents) == set()
    # Slots 1-3 are re-used unchanged (presumably options, position and
    # size, matching the add_version argument order -- verify).
    self._index.add_version(version, entry[1], entry[2], entry[3],
                            new_parents)

427

428

def get_delta(self, version_id):
    """Get a delta for constructing version from some other version.

    Returns (parent, sha1, noeol, delta) where parent is the first parent
    of version_id, or None when it has none.
    Raises RevisionNotPresent when version_id is not in this knit.
    """
    if not self.has_version(version_id):
        raise RevisionNotPresent(version_id, self.filename)

    parent = None
    parents = self.get_parents(version_id)
    if len(parents):
        parent = parents[0]

    data_pos, data_size = self._index.get_position(version_id)
    wanted = ((version_id, data_pos, data_size),)
    data, sha1 = self._data.read_records(wanted)[version_id]
    version_idx = self._index.lookup(version_id)
    noeol = 'no-eol' in self._index.get_options(version_id)

    if 'fulltext' != self._index.get_method(version_id):
        # Stored as a delta already: just parse it.
        delta = self.factory.parse_line_delta(data, version_idx)
        return parent, sha1, noeol, delta

    # Stored as a fulltext: diff it against the first parent's text, or
    # against nothing when there is no parent.
    new_content = self.factory.parse_fulltext(data, version_idx)
    if parent is None:
        old_texts = []
    else:
        old_texts = self._get_content(parent).text()
    new_texts = new_content.text()
    delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
    return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

455

456

def get_graph_with_ghosts(self):
    """See VersionedFile.get_graph_with_ghosts()."""
    # The index supplies the graph as pairs; hand it back as a dict.
    return dict(self._index.get_graph())

460

461

def get_sha1(self, version_id):
    """See VersionedFile.get_sha1()."""
    records = self._get_record_map([version_id])
    # Each record is a 4-tuple; only the digest (third item) is wanted.
    _method, _content, digest, _following = records[version_id]
    return digest

466

467

@staticmethod
def get_suffixes():
    """See VersionedFile.get_suffixes()."""
    # Data suffix first, index suffix second.
    suffixes = [DATA_SUFFIX, INDEX_SUFFIX]
    return suffixes

471

472

def has_ghost(self, version_id):
    """True if there is a ghost reference in the file to version_id.

    A version that is actually present is never reported as a ghost.
    """
    if self.has_version(version_id):
        # maybe we have it
        return False
    # optimisable if needed by memoising the _ghosts set.
    for _node, parents in self._index.get_graph():
        for parent in parents:
            if parent == version_id and parent not in self._index._cache:
                return True
    return False

485

486

def versions(self):
    """See VersionedFile.versions."""
    index = self._index
    return index.get_versions()

489

490

def has_version(self, version_id):
    """See VersionedFile.has_version."""
    index = self._index
    return index.has_version(version_id)

493

494

__contains__ = has_version

495

496

def _merge_annotations(self, content, parents, parent_texts={},

497

delta=None, annotated=None):

498

"""Merge annotations for content. This is done by comparing

499

the annotations based on changed to the text.

500

"""

501

if annotated:

502

delta_seq = None

503

for parent_id in parents:

504

merge_content = self._get_content(parent_id, parent_texts)

505

seq = KnitSequenceMatcher(None, merge_content.text(), content.text())

506

if delta_seq is None:

507

# setup a delta seq to reuse.

508

delta_seq = seq

509

for i, j, n in seq.get_matching_blocks():

510

if n == 0:

511

continue

512

# this appears to copy (origin, text) pairs across to the new

513

# content for any line that matches the last-checked parent.

514

# FIXME: save the sequence control data for delta compression

515

# against the most relevant parent rather than rediffing.

516

content._lines[j:j+n] = merge_content._lines[i:i+n]

517

if delta:

518

if not annotated:

519

reference_content = self._get_content(parents[0], parent_texts)

520

new_texts = content.text()

521

old_texts = reference_content.text()

522

delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)

523

return self._make_line_delta(delta_seq, content)

524

525

def _make_line_delta(self, delta_seq, new_content):

526

"""Generate a line delta from delta_seq and new_content."""

527

diff_hunks = []

528

for op in delta_seq.get_opcodes():

529

if op[0] == 'equal':

530

continue

531

diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))

532

return diff_hunks

533

534

def _get_components_positions(self, version_ids):

535

"""Produce a map of position data for the components of versions.

536

537

This data is intended to be used for retrieving the knit records.

538

539

A dict of version_id to (method, data_pos, data_size, next) is

540

returned.

541

method is the way referenced data should be applied.

542

data_pos is the position of the data in the knit.

543

data_size is the size of the data in the knit.

544

next is the build-parent of the version, or None for fulltexts.

545

"""

546

component_data = {}

547

for version_id in version_ids:

548

cursor = version_id

549

550

while cursor is not None and cursor not in component_data:

551

method = self._index.get_method(cursor)

552

if method == 'fulltext':

553

next = None

554

else:

555

next = self.get_parents(cursor)[0]

556

data_pos, data_size = self._index.get_position(cursor)

557

component_data[cursor] = (method, data_pos, data_size, next)

558

cursor = next

559

return component_data

560

561

def _get_content(self, version_id, parent_texts={}):

562

"""Returns a content object that makes up the specified

563

version."""

564

if not self.has_version(version_id):

565

raise RevisionNotPresent(version_id, self.filename)

566

567

cached_version = parent_texts.get(version_id, None)

568

if cached_version is not None:

569

return cached_version

570

571

text_map, contents_map = self._get_content_maps([version_id])

572

return contents_map[version_id]

573

574

def _check_versions_present(self, version_ids):

575

"""Check that all specified versions are present."""

576

version_ids = set(version_ids)

577

for r in list(version_ids):

578

if self._index.has_version(r):

579

version_ids.remove(r)

580

if version_ids:

581

raise RevisionNotPresent(list(version_ids)[0], self.filename)

582

583

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):

584

"""See VersionedFile.add_lines_with_ghosts()."""

585

self._check_add(version_id, lines)

586

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

587

588

def _add_lines(self, version_id, parents, lines, parent_texts):

589

"""See VersionedFile.add_lines."""

590

self._check_add(version_id, lines)

591

self._check_versions_present(parents)

592

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

593

594

def _check_add(self, version_id, lines):

595

"""check that version_id and lines are safe to add."""

596

assert self.writable, "knit is not opened for write"

597

### FIXME escape. RBC 20060228

598

if contains_whitespace(version_id):

599

raise InvalidRevisionId(version_id, self.filename)

600

if self.has_version(version_id):

601

raise RevisionAlreadyPresent(version_id, self.filename)

602

self._check_lines_not_unicode(lines)

603

self._check_lines_are_lines(lines)

604

605

def _add(self, version_id, lines, parents, delta, parent_texts):
    """Add a set of lines on top of version specified by parents.

    If delta is true, compress the text as a line-delta against
    the first parent.  A fulltext is stored instead when no parent is
    present in the knit, or when the delta chain behind the first present
    parent is already 25 deltas long.

    Any versions not present will be converted into ghosts: they are
    written to the index entry but are not used as a delta or annotation
    basis.

    :param lines: list of text lines.  When the last line lacks a trailing
        newline one is appended in place and the record is flagged
        'no-eol'.
    :param parent_texts: optional dict of version_id to content objects,
        passed on to _merge_annotations(); None means no preloaded texts.
    :return: the content object built by self.factory.make().
    """
    if parent_texts is None:
        parent_texts = {}
    # parents that are not in the knit (ghosts) cannot be diffed against.
    present_parents = [parent for parent in parents
                       if self.has_version(parent)]

    if delta and not present_parents:
        delta = False

    # the digest covers the text as given, before any newline is appended.
    digest = sha_strings(lines)
    options = []
    # endswith() also copes with an empty last line, which lines[-1][-1]
    # would reject with an IndexError.
    if lines and not lines[-1].endswith('\n'):
        options.append('no-eol')
        lines[-1] = lines[-1] + '\n'

    if present_parents and delta:
        # To speed the extract of texts the delta chain is limited
        # to a fixed number of deltas.  This should minimize both
        # I/O and the time spend applying deltas.
        count = 0
        delta_parents = present_parents
        while count < 25:
            parent = delta_parents[0]
            method = self._index.get_method(parent)
            if method == 'fulltext':
                break
            delta_parents = self._index.get_parents(parent)
            count = count + 1
        if method == 'line-delta':
            # no fulltext was reached within the limit: store a fulltext.
            delta = False

    lines = self.factory.make(lines, version_id)
    if delta or (self.factory.annotated and present_parents):
        # Merge annotations from parent texts if so is needed.
        delta_hunks = self._merge_annotations(lines, present_parents,
                                              parent_texts, delta,
                                              self.factory.annotated)

    if delta:
        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta_hunks)
    else:
        options.append('fulltext')
        store_lines = self.factory.lower_fulltext(lines)

    where, size = self._data.add_record(version_id, digest, store_lines)
    # the index entry keeps the full parent list, ghosts included.
    self._index.add_version(version_id, options, where, size, parents)
    return lines

677

678

def check(self, progress_bar=None):
    """See VersionedFile.check().

    No checks are performed here; progress_bar is accepted and ignored.
    """
    return None

680

681

def _clone_text(self, new_version_id, old_version_id, parents):

682

"""See VersionedFile.clone_text()."""

683

# FIXME RBC 20060228 make fast by only inserting an index with null

684

# delta.

685

self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

686

687

def get_lines(self, version_id):
    """See VersionedFile.get_lines().

    Implemented as a single-version get_line_list() call.
    """
    line_lists = self.get_line_list([version_id])
    return line_lists[0]

690

691

def _get_record_map(self, version_ids):

692

"""Produce a dictionary of knit records.

693

694

The keys are version_ids, the values are tuples of (method, content,

695

digest, next).

696

method is the way the content should be applied.

697

content is a KnitContent object.

698

digest is the SHA1 digest of this version id after all steps are done

699

next is the build-parent of the version, i.e. the leftmost ancestor.

700

If the method is fulltext, next will be None.

701

"""

702

position_map = self._get_components_positions(version_ids)

703

# c = component_id, m = method, p = position, s = size, n = next

704

records = [(c, p, s) for c, (m, p, s, n) in position_map.iteritems()]

705

record_map = {}

706

for component_id, content, digest in \

707

self._data.read_records_iter(records):

708

method, position, size, next = position_map[component_id]

709

record_map[component_id] = method, content, digest, next

710

711

return record_map

712

713

def get_text(self, version_id):
    """See VersionedFile.get_text.

    Implemented as a single-version get_texts() call.
    """
    texts = self.get_texts([version_id])
    return texts[0]

716

717

def get_texts(self, version_ids):
    """Return the texts of the listed versions, each as a single string."""
    texts = []
    for line_list in self.get_line_list(version_ids):
        texts.append(''.join(line_list))
    return texts

719

720

def get_line_list(self, version_ids):
    """Return the texts of listed versions as a list of strings.

    The result has one list of lines per entry of version_ids, in the
    same order.
    """
    text_map, _content_map = self._get_content_maps(version_ids)
    line_lists = []
    for version_id in version_ids:
        line_lists.append(text_map[version_id])
    return line_lists

724

725

def _get_content_maps(self, version_ids):

726

"""Produce maps of text and KnitContents

727

728

:return: (text_map, content_map) where text_map contains the texts for

729

the requested versions and content_map contains the KnitContents.

730

Both dicts take version_ids as their keys.

731

"""

732

for version_id in version_ids:

733

if not self.has_version(version_id):

734

raise RevisionNotPresent(version_id, self.filename)

735

record_map = self._get_record_map(version_ids)

736

737

text_map = {}

738

content_map = {}

739

final_content = {}

740

for version_id in version_ids:

741

components = []

742

cursor = version_id

743

while cursor is not None:

744

method, data, digest, next = record_map[cursor]

745

components.append((cursor, method, data, digest))

746

if cursor in content_map:

747

break

748

cursor = next

749

750

content = None

751

for component_id, method, data, digest in reversed(components):

752

if component_id in content_map:

753

content = content_map[component_id]

754

else:

755

version_idx = self._index.lookup(component_id)

756

if method == 'fulltext':

757

assert content is None

758

content = self.factory.parse_fulltext(data, version_idx)

759

elif method == 'line-delta':

760

delta = self.factory.parse_line_delta(data[:],

761

version_idx)

762

content = content.copy()

763

content._lines = self._apply_delta(content._lines,

764

delta)

765

content_map[component_id] = content

766

767

if 'no-eol' in self._index.get_options(version_id):

768

content = content.copy()

769

line = content._lines[-1][1].rstrip('\n')

770

content._lines[-1] = (content._lines[-1][0], line)

771

final_content[version_id] = content

772

773

# digest here is the digest from the last applied component.

774

text = content.text()

775

if sha_strings(text) != digest:

776

raise KnitCorrupt(self.filename,

777

'sha-1 does not match %s' % version_id)

778

779

text_map[version_id] = text

780

return text_map, final_content

781

782

def iter_lines_added_or_present_in_versions(self, version_ids=None):
    """See VersionedFile.iter_lines_added_or_present_in_versions().

    Yields the text lines stored in the record of each requested version:
    every line of a fulltext record, and the inserted lines of a
    line-delta record.  Records are visited in the order of
    self.versions().

    :param version_ids: the versions to visit; None means all versions.
    Raises RevisionNotPresent for a requested version that is not in the
    knit.
    """
    if version_ids is None:
        version_ids = self.versions()
    # we don't care about inclusions, the caller cares.
    # but we need to setup a list of records to visit.
    # we need version_id, position, length
    requested_versions = list(version_ids)
    # filter for available versions
    for version_id in requested_versions:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    # a set keeps the membership test below constant-time.
    requested = set(requested_versions)
    # get a in-component-order queue:
    version_id_records = []
    for version_id in self.versions():
        if version_id in requested:
            data_pos, length = self._index.get_position(version_id)
            version_id_records.append((version_id, data_pos, length))

    pb = bzrlib.ui.ui_factory.nested_progress_bar()
    count = 0
    total = len(version_id_records)
    # NOTE(review): try/except with duplicated cleanup is kept instead of
    # try/finally, presumably because yield inside try/finally needs
    # Python 2.5 or later - confirm the supported versions before changing.
    try:
        pb.update('Walking content.', count, total)
        for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
            pb.update('Walking content.', count, total)
            method = self._index.get_method(version_id)
            version_idx = self._index.lookup(version_id)
            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                content = self.factory.parse_fulltext(data, version_idx)
                for line in content.text():
                    yield line
            else:
                delta = self.factory.parse_line_delta(data, version_idx)
                # the hunk's line count gets its own name so that it does
                # not overwrite the progress counter.
                for start, end, hunk_size, lines in delta:
                    for origin, line in lines:
                        yield line
            count += 1
        pb.update('Walking content.', total, total)
        pb.finished()
    except:
        pb.update('Walking content.', total, total)
        pb.finished()
        raise

830

831

def num_versions(self):
    """See VersionedFile.num_versions().

    Delegates to the index's num_versions().
    """
    version_count = self._index.num_versions()
    return version_count

# len(knit) is the number of versions.
__len__ = num_versions

836

837

def annotate_iter(self, version_id):
    """See VersionedFile.annotate_iter.

    Yields the (origin, text) pairs produced by the content object of
    version_id.
    """
    annotated = self._get_content(version_id).annotate_iter()
    for origin, line in annotated:
        yield origin, line

842

843

def get_parents(self, version_id):
    """See VersionedFile.get_parents.

    Raises RevisionNotPresent when the index lookup fails with KeyError.
    """
    # perf notes:
    # optimism counts!
    # 52554 calls in 1264 872 internal down from 3674
    try:
        parents = self._index.get_parents(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    else:
        return parents

852

853

def get_parents_with_ghosts(self, version_id):
    """See VersionedFile.get_parents_with_ghosts.

    Raises RevisionNotPresent when the index lookup fails with KeyError.
    """
    try:
        parents = self._index.get_parents_with_ghosts(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    else:
        return parents

859

860

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    versions may be a single version id string or a sequence of ids; an
    empty request returns an empty list.  Every requested version must be
    present in the knit.
    """
    if isinstance(versions, basestring):
        versions = [versions]
    if versions:
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)
    return []

868

869

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    versions may be a single version id string or a sequence of ids; an
    empty request returns an empty list.  Every requested version must be
    present in the knit.
    """
    if isinstance(versions, basestring):
        versions = [versions]
    if versions:
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)
    return []

877

878

#@deprecated_method(zero_eight)

879

def walk(self, version_ids):
    """See VersionedFile.walk.

    Copies the ancestry of version_ids into a temporary Weave, in
    topological order, and yields that weave's walk() results.
    """
    # We take the short path here, and extract all relevant texts
    # and put them in a weave and let that do all the work.  Far
    # from optimal, but is much simpler.
    # FIXME RB 20060228 this really is inefficient!
    from bzrlib.weave import Weave

    w = Weave(self.filename)
    # a set makes the per-version membership test constant-time.
    ancestry = set(self.get_ancestry(version_ids))
    sorted_graph = topo_sort(self._index.get_graph())
    version_list = [vid for vid in sorted_graph if vid in ancestry]

    for version_id in version_list:
        lines = self.get_lines(version_id)
        w.add_lines(version_id, self.get_parents(version_id), lines)

    for lineno, insert_id, dset, line in w.walk(version_ids):
        yield lineno, insert_id, dset, line

898

899

def plan_merge(self, ver_a, ver_b):
    """See VersionedFile.plan_merge.

    Yields (state, text) pairs.  Lines common to both annotated texts are
    'unchanged'.  A line found only in ver_a is 'killed-b' when its
    revision is in ver_b's ancestry and 'new-a' otherwise; lines found
    only in ver_b get 'killed-a' or 'new-b' in the same way.
    """
    ancestors_b = set(self.get_ancestry(ver_b))
    ancestors_a = set(self.get_ancestry(ver_a))

    def classify(revision, text, other_ancestry, killed, new):
        # a line whose revision the other side already has was removed there.
        if revision in other_ancestry:
            return killed, text
        return new, text

    annotated_a = self.annotate(ver_a)
    annotated_b = self.annotate(ver_b)
    plain_a = [text for (_origin, text) in annotated_a]
    plain_b = [text for (_origin, text) in annotated_b]
    matcher = KnitSequenceMatcher(None, plain_a, plain_b)
    a_pos = 0
    b_pos = 0
    for a_start, b_start, length in matcher.get_matching_blocks():
        # process all mismatched sections
        # (last mismatched section is handled because blocks always
        # includes a 0-length last block)
        for revision, text in annotated_a[a_pos:a_start]:
            yield classify(revision, text, ancestors_b, 'killed-b', 'new-a')
        for revision, text in annotated_b[b_pos:b_start]:
            yield classify(revision, text, ancestors_a, 'killed-a', 'new-b')

        # and now the matched section
        a_pos = a_start + length
        b_pos = b_start + length
        for text_a, text_b in zip(plain_a[a_start:a_pos],
                                  plain_b[b_start:b_pos]):
            assert text_a == text_b
            yield "unchanged", text_a

937

938

939

class _KnitComponentFile(object):

940

"""One of the files used to implement a knit database"""

941

942

def __init__(self, transport, filename, mode, file_mode=None):

943

self._transport = transport

944

self._filename = filename

945

self._mode = mode

946

self._file_mode=file_mode

947

948

def write_header(self):

949

if self._transport.append(self._filename, StringIO(self.HEADER),

950

mode=self._file_mode):

951

raise KnitCorrupt(self._filename, 'misaligned after writing header')

952

953

def check_header(self, fp):

954

line = fp.readline()

955

if line != self.HEADER:

956

raise KnitHeaderError(badline=line)

957

958

def commit(self):

959

"""Commit is a nop."""

960

961

def __repr__(self):

962

return '%s(%s)' % (self.__class__.__name__, self._filename)

963

964

965

class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to a Index
    object.  Each entry is the tuple
    (version_id, options, pos, size, parents, index).

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the record in the
        data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    partial writes:
    when a write is interrupted to the index file, it will result in a line that
    does not end in ' :'. If the ' :' is not present at the end of a line, or at
    the end of the file, then the record that is missing it will be ignored by
    the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result its normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """

    HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        """Load the index from transport, or create it.

        A missing file is only tolerated when mode is 'w' and create is
        true; the header is then written to start a new index.  Any other
        NoSuchFile is re-raised.
        """
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                try:
                    self.check_header(fp)
                    # readlines reads the whole file at once:
                    # bad for transports like http, good for local disk
                    # we save 60 ms doing this one change (
                    # from calling readline each time to calling
                    # readlines once.
                    # probably what we want for nice behaviour on
                    # http is a incremental readlines that yields, or
                    # a check for local vs non local indexes,
                    for l in fp.readlines():
                        rec = l.split()
                        if len(rec) < 5 or rec[-1] != ':':
                            # corrupt line.
                            # FIXME: in the future we should determine if its a
                            # short write - and ignore it
                            # or a different failure, and raise. RBC 20060407
                            continue
                        count += 1
                        total += 1
                        #pb.update('read knit index', count, total)
                        # See self._parse_parents
                        parents = []
                        for value in rec[4:-1]:
                            if '.' == value[0]:
                                # uncompressed reference
                                parents.append(value[1:])
                            else:
                                # this is 15/4000ms faster than isinstance,
                                # (in lsprof)
                                # this function is called thousands of times a
                                # second so small variations add up.
                                assert value.__class__ is str
                                parents.append(self._history[int(value)])
                        # end self._parse_parents
                        # self._cache_version(rec[0],
                        #                     rec[1].split(','),
                        #                     int(rec[2]),
                        #                     int(rec[3]),
                        #                     parents)
                        # --- self._cache_version
                        # only want the _history index to reference the 1st
                        # index entry for version_id
                        version_id = rec[0]
                        if version_id not in self._cache:
                            index = len(self._history)
                            self._history.append(version_id)
                        else:
                            index = self._cache[version_id][5]
                        self._cache[version_id] = (version_id,
                                                   rec[1].split(','),
                                                   int(rec[2]),
                                                   int(rec[3]),
                                                   parents,
                                                   index)
                        # --- self._cache_version
                finally:
                    fp.close()
            except NoSuchFile:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted in to FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            # the '.' marker is a prefix, exactly as in the copy inlined
            # in __init__ and as written by _version_list_to_index().
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        """Return a list of (version_id, parents) for every cached version.

        parents is the stored parent list, including parents that are not
        in the index.
        """
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok
            # trim ghosts
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            try:
                parents = self._cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # got the parents ok
                for parent in parents:
                    if parent not in graph:
                        pending.add(parent)
                graph[version] = parents
        return topo_sort(graph.items())

    def num_versions(self):
        """Return the number of distinct version ids in the index."""
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        """Return the version ids in order of first appearance.

        The internal _history list itself is returned, not a copy.
        """
        return self._history

    def idx_to_name(self, idx):
        """Return the version id whose sequence number is idx."""
        return self._history[idx]

    def lookup(self, version_id):
        """Return the sequence number of version_id."""
        assert version_id in self._cache
        return self._cache[version_id][5]

    def _version_list_to_index(self, versions):
        """Encode versions as the parent fields of an index line.

        Versions in the index become their sequence number; all others
        become '.' followed by the utf-8 encoded version id.
        """
        result_list = []
        for version in versions:
            if version in self._cache:
                # -- inlined lookup() --
                result_list.append(str(self._cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version.encode('utf-8'))
        return ' '.join(result_list)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        lines = []
        for version_id, options, pos, size, parents in versions:
            line = "\n%s %s %s %s %s :" % (version_id.encode('utf-8'),
                                           ','.join(options),
                                           pos,
                                           size,
                                           self._version_list_to_index(parents))
            assert isinstance(line, str), \
                'content must be utf-8 encoded: %r' % (line,)
            lines.append(line)
        self._transport.append(self._filename, StringIO(''.join(lines)))
        # cache after writing, so that a failed write leads to missing cache
        # entries not extra ones. XXX TODO: RBC 20060502 in the event of a
        # failure, reload the index or flush it or some such, to prevent
        # writing records that did complete twice.
        for version_id, options, pos, size, parents in versions:
            self._cache_version(version_id, options, pos, size, parents)

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        entry = self._cache[version_id]
        return (entry[2], entry[3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        """Return the list of option flags recorded for version_id."""
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present.

        Raises RevisionNotPresent, naming one of the missing versions.
        """
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            # component files store their name in _filename.
            raise RevisionNotPresent(list(version_ids)[0], self._filename)

1285

1286

1287

class _KnitData(_KnitComponentFile):

1288

"""Contents of the knit data file"""

1289

1290

HEADER = "# bzr knit data 8\n"

1291

1292

def __init__(self, transport, filename, mode, create=False, file_mode=None):
    """Set up access to the knit data file.

    :param create: when true, an empty file is written to the transport
        straight away, using file_mode.
    :param file_mode: permission bits for the file; also stored on the
        instance by the base initializer.
    """
    # forward file_mode so that self._file_mode reflects the requested mode.
    _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
    self._checked = False
    # TODO: jam 20060713 conceptually, this could spill to disk
    #       if the cached size gets larger than a certain amount
    #       but it complicates the model a bit, so for now just use
    #       a simple dictionary
    self._cache = {}
    self._do_cache = False
    if create:
        self._transport.put(self._filename, StringIO(''), mode=file_mode)

1303

1304

def enable_cache(self):
    """Enable caching of reads.

    Only sets the _do_cache flag; nothing is cached by this call itself.
    """
    setattr(self, '_do_cache', True)

1307

1308

def clear_cache(self):
    """Clear the record cache.

    Caching is switched off as well, and the cache is replaced with a
    new empty dict.
    """
    self._cache = {}
    self._do_cache = False

1312

1313

def _open_file(self):

1314

try:

1315

return self._transport.get(self._filename)

1316

except NoSuchFile:

1317

pass

1318

return None

1319

1320

def _record_to_data(self, version_id, digest, lines):
    """Convert version_id, digest, lines into a raw data block.

    The block is gzip-compressed and consists of a
    "version <id> <line count> <digest>" line, the lines themselves and
    an "end <id>" line.

    :return: (len, a StringIO instance with the raw data ready to read.)
    """
    buf = StringIO()
    gz = GzipFile(None, mode='wb', fileobj=buf)
    utf8_id = version_id.encode('utf-8')
    header = "version %s %d %s\n" % (utf8_id, len(lines), digest)
    footer = "end %s\n" % version_id.encode('utf-8')
    gz.writelines(chain([header], lines, [footer]))
    gz.close()
    # the write position after close() is the compressed length.
    length = buf.tell()

    buf.seek(0)
    return length, buf

1338

1339

def add_raw_record(self, raw_data):
    """Append a prepared record to the data file.

    :param raw_data: the record as a plain (non-unicode) string.
    :return: the offset in the data file raw_data was written.
    """
    assert isinstance(raw_data, str), 'data must be plain bytes'
    payload = StringIO(raw_data)
    return self._transport.append(self._filename, payload)

1346

1347

def add_record(self, version_id, digest, lines):

1348

"""Write new text record to disk. Returns the position in the

1349

file where it was written."""

1350

size, sio = self._record_to_data(version_id, digest, lines)

1351

# write to disk

1352

start_pos = self._transport.append(self._filename, sio)

1353

if self._do_cache:

1354

self._cache[version_id] = sio.getvalue()

1355

return start_pos, size

1356

1357

def _parse_record_header(self, version_id, raw_data):

1358

"""Parse a record header for consistency.

1359

1360

:return: the header and the decompressor stream.

1361

as (stream, header_record)

1362

"""

1363

df = GzipFile(mode='rb', fileobj=StringIO(raw_data))

1364

rec = df.readline().split()

1365

if len(rec) != 4:

1366

raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')

1367

if rec[1].decode('utf-8')!= version_id:

1368

raise KnitCorrupt(self._filename,

1369

'unexpected version, wanted %r, got %r' % (

1370

version_id, rec[1]))

1371

return df, rec

1372

1373

def _parse_record(self, version_id, data):

1374

# profiling notes:

1375

# 4168 calls in 2880 217 internal

1376

# 4168 calls to _parse_record_header in 2121

1377

# 4168 calls to readlines in 330

1378

df, rec = self._parse_record_header(version_id, data)

1379

record_contents = df.readlines()

1380

l = record_contents.pop()

1381

assert len(record_contents) == int(rec[2])

1382

if l.decode('utf-8') != 'end %s\n' % version_id:

1383

raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'

1384

% (l, version_id))

1385

df.close()

1386

return record_contents, rec[3]

1387

1388

def read_records_iter_raw(self, records):

1389

"""Read text records from data file and yield raw data.

1390

1391

This unpacks enough of the text record to validate the id is

1392

as expected but thats all.

1393

"""

1394

# setup an iterator of the external records:

1395

# uses readv so nice and fast we hope.

1396

if len(records):

1397

# grab the disk data needed.

1398

if self._cache:

1399

# Don't check _cache if it is empty

1400

needed_offsets = [(pos, size) for version_id, pos, size

1401

in records

1402

if version_id not in self._cache]

1403

else:

1404

needed_offsets = [(pos, size) for version_id, pos, size

1405

in records]

1406

1407

raw_records = self._transport.readv(self._filename, needed_offsets)

1408

1409

1410

for version_id, pos, size in records:

1411

if version_id in self._cache:

1412

# This data has already been validated

1413

data = self._cache[version_id]

1414

else:

1415

pos, data = raw_records.next()

1416

if self._do_cache:

1417

self._cache[version_id] = data

1418

1419

# validate the header

1420

df, rec = self._parse_record_header(version_id, data)

1421

df.close()

1422

yield version_id, data

1423

1424

def read_records_iter(self, records):

1425

"""Read text records from data file and yield result.

1426

1427

The result will be returned in whatever is the fastest to read.

1428

Not by the order requested. Also, multiple requests for the same

1429

record will only yield 1 response.

1430

:param records: A list of (version_id, pos, len) entries

1431

:return: Yields (version_id, contents, digest) in the order

1432

read, not the order requested

1433

"""

1434

if not records:

1435

return

1436

1437

if self._cache:

1438

# Skip records we have alread seen

1439

yielded_records = set()

1440

needed_records = set()

1441

for record in records:

1442

if record[0] in self._cache:

1443

if record[0] in yielded_records:

1444

continue

1445

yielded_records.add(record[0])

1446

data = self._cache[record[0]]

1447

content, digest = self._parse_record(record[0], data)

1448

yield (record[0], content, digest)

1449

else:

1450

needed_records.add(record)

1451

needed_records = sorted(needed_records, key=operator.itemgetter(1))

1452

else:

1453

needed_records = sorted(set(records), key=operator.itemgetter(1))

1454

1455

if not needed_records:

1456

return

1457

1458

# The transport optimizes the fetching as well

1459

# (ie, reads continuous ranges.)

1460

readv_response = self._transport.readv(self._filename,

1461

[(pos, size) for version_id, pos, size in needed_records])

1462

1463

for (version_id, pos, size), (pos, data) in \

1464

izip(iter(needed_records), readv_response):

1465

content, digest = self._parse_record(version_id, data)

1466

if self._do_cache:

1467

self._cache[version_id] = data

1468

yield version_id, content, digest

1469

1470

def read_records(self, records):

1471

"""Read records into a dictionary."""

1472

components = {}

1473

for record_id, content, digest in \

1474

self.read_records_iter(records):

1475

components[record_id] = (content, digest)

1476

return components

1477

1478

1479

class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Return True only when both source and target are knits."""
        try:
            if not isinstance(source, KnitVersionedFile):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Missing versions are copied as raw records from the source data
        file.  Versions present on both sides whose parent sets differ get
        their parents fixed in the target.  The ``pb`` argument is replaced
        by a nested progress bar from the ui factory.

        :return: the number of versions copied (0 when nothing was needed).
        """
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            target_versions = set(self.target._index.get_versions())
            to_fetch = self.source_ancestry - target_versions
            shared_versions = self.source_ancestry.intersection(target_versions)
            parents_differ = set()
            for version in shared_versions:
                # scan to include needed parents.
                target_parents = set(self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents_with_ghosts(version))
                if target_parents == source_parents:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the source-side parents.
                for parent in source_parents:
                    if parent not in target_parents:
                        if version in self.source.get_ancestry(parent):
                            raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                extra_parents = source_parents.difference(target_parents)
                to_fetch.update(extra_parents.difference(target_versions))
                parents_differ.add(version)

            if not (to_fetch or parents_differ):
                return 0
            ordered_versions = topo_sort(self.source.get_graph())

            version_list = []
            for candidate in ordered_versions:
                if self.target.has_version(candidate):
                    continue
                if candidate in to_fetch:
                    version_list.append(candidate)

            # plan the join:
            planned = []
            planned_positions = []
            scheduled = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in scheduled or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                planned_positions.append((version_id, data_pos, data_size))
                planned.append((version_id, options, parents))
                scheduled.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_chunks = []
            index_entries = []
            raw_stream = self.source._data.read_records_iter_raw(
                planned_positions)
            for fetched, plan in izip(raw_stream, planned):
                version_id, raw_data = fetched
                planned_id, options, parents = plan
                assert version_id == planned_id, 'logic error, inconsistent results'
                count += 1
                pb.update("Joining knit", count, total)
                index_entries.append(
                    (version_id, options, parents, len(raw_data)))
                raw_chunks.append(raw_data)
            self.target._add_raw_records(index_entries, ''.join(raw_chunks))

            for version in parents_differ:
                # FIXME RBC 20060309 is this needed?
                target_parents = set(self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                merged_parents = self.target.get_parents_with_ghosts(version)
                merged_parents = merged_parents + list(
                    source_parents.difference(target_parents))
                self.target.fix_parents(version, merged_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)

1594

1595

1596

class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Return True only for a weave source and a knit target."""
        try:
            if not isinstance(source, bzrlib.weave.Weave):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Missing versions are re-added to the knit line by line via
        ``add_lines``.  Versions present on both sides for which the weave
        lists parents the knit lacks get their parents fixed in the target.
        The ``pb`` argument is replaced by a nested progress bar from the
        ui factory.

        :return: the number of versions converted (0 when nothing was
            needed).
        """
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            target_versions = set(self.target._index.get_versions())
            to_fetch = self.source_ancestry - target_versions
            shared_versions = self.source_ancestry.intersection(target_versions)
            parents_differ = set()
            for version in shared_versions:
                # scan to include needed parents.
                target_parents = set(self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents(version))
                # if all of the source parents are known to the target,
                # then it is fine.
                if not source_parents.difference(target_parents):
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the source-side parents.
                for parent in source_parents:
                    if parent not in target_parents:
                        if version in self.source.get_ancestry(parent):
                            raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                extra_parents = source_parents.difference(target_parents)
                to_fetch.update(extra_parents.difference(target_versions))
                parents_differ.add(version)

            if not (to_fetch or parents_differ):
                return 0
            ordered_versions = topo_sort(self.source.get_graph())

            version_list = []
            for candidate in ordered_versions:
                if self.target.has_version(candidate):
                    continue
                if candidate in to_fetch:
                    version_list.append(candidate)

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                text_lines = self.source.get_lines(version_id)
                self.target.add_lines(version_id, parents, text_lines)
                count += 1

            for version in parents_differ:
                # FIXME RBC 20060309 is this needed?
                target_parents = set(self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                merged_parents = self.target.get_parents_with_ghosts(version)
                merged_parents = merged_parents + list(
                    source_parents.difference(target_parents))
                self.target.fix_parents(version, merged_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(WeaveToKnit)

1687

1688

1689

class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = KnitSequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = KnitSequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = KnitSequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)

        :return: a plain (i, j, k) tuple.
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j = self.a, self.b, self.b2j
        # Older difflib exposes the junk test as the ``isbjunk`` callable;
        # later releases keep the junk elements in the ``bjunk`` set
        # instead.  Use whichever one the base class provides.
        isbjunk = getattr(self, 'isbjunk', None)
        if isbjunk is None:
            isbjunk = self.bjunk.__contains__
        # Stay lazy where xrange exists; fall back to range elsewhere.
        try:
            a_indices = xrange(alo, ahi)
        except NameError:
            a_indices = range(alo, ahi)

        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in a_indices:
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # Profiling showed that a try/KeyError lookup is measurably
            # cheaper here than b2j.get(a[i], nothing): roughly 4650 ->
            # 3733 total for 704 calls, with about a third fewer dict.get
            # calls.
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

1821

Older »