/brz/remove-bazaar : revision 1756.2.16

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Aaron Bentley
Date: 2006-06-17 18:44:05 UTC
mfrom: (1786 +trunk)
mto: This revision was merged to the branch mainline in revision 1788.
Revision ID: aaron.bentley@utoronto.ca-20060617184405-ba00b55631c7da57

Merge bzr.dev

files added:
.bzrignore

.rsyncexclude

BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

README

TODO

build-api

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/benchmarks

bzrlib/benchmarks/__init__.py

bzrlib/benchmarks/bench_add.py

bzrlib/benchmarks/bench_bench.py

bzrlib/benchmarks/bench_checkout.py

bzrlib/benchmarks/bench_commit.py

bzrlib/benchmarks/bench_inventory.py

bzrlib/benchmarks/bench_log.py

bzrlib/benchmarks/bench_osutils.py

bzrlib/benchmarks/bench_rocks.py

bzrlib/benchmarks/bench_status.py

bzrlib/benchmarks/bench_transform.py

bzrlib/benchmarks/bench_workingtree.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bundle

bzrlib/bundle/__init__.py

bzrlib/bundle/apply_bundle.py

bzrlib/bundle/commands.py

bzrlib/bundle/common.py

bzrlib/bundle/old

bzrlib/bundle/old/send_changeset.py

bzrlib/bundle/read_bundle.py

bzrlib/bundle/serializer

bzrlib/bundle/serializer/__init__.py

bzrlib/bundle/serializer/v07.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/patches.py

bzrlib/patiencediff.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/plugins/launchpad

bzrlib/plugins/launchpad/__init__.py

bzrlib/plugins/launchpad/lp_registration.py

bzrlib/plugins/launchpad/test_register.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/status.py

bzrlib/store

bzrlib/store/__init__.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/EncodingAdapter.py

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_add.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_annotate.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_branch.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_command_encoding.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_ignored.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_non_ascii.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_remove.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_break_lock.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_break_lock.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bundle.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_emptytree.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_patch.py

bzrlib/tests/test_patches.py

bzrlib/tests/test_patches_data

bzrlib/tests/test_patches_data/diff

bzrlib/tests/test_patches_data/diff-2

bzrlib/tests/test_patches_data/diff-3

bzrlib/tests/test_patches_data/diff-4

bzrlib/tests/test_patches_data/diff-5

bzrlib/tests/test_patches_data/diff-6

bzrlib/tests/test_patches_data/insert_top.patch

bzrlib/tests/test_patches_data/mod

bzrlib/tests/test_patches_data/mod-2

bzrlib/tests/test_patches_data/mod-3

bzrlib/tests/test_patches_data/mod-4

bzrlib/tests/test_patches_data/mod-5

bzrlib/tests/test_patches_data/mod-6

bzrlib/tests/test_patches_data/orig

bzrlib/tests/test_patches_data/orig-2

bzrlib/tests/test_patches_data/orig-3

bzrlib/tests/test_patches_data/orig-4

bzrlib/tests/test_patches_data/orig-5

bzrlib/tests/test_patches_data/orig-6

bzrlib/tests/test_patches_data/patchtext.patch

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_read_bundle.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revisiontree.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_status.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_textfile.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_urlutils.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/tests/workingtree_implementations/test_break_lock.py

bzrlib/tests/workingtree_implementations/test_commit.py

bzrlib/tests/workingtree_implementations/test_get_parent_ids.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_is_ignored.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textfile.py

bzrlib/textinv.py

bzrlib/textmerge.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/urlutils.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

doc/README.1st

doc/configuration.txt

doc/plugins.txt

doc/setting_up_email.txt

doc/specifying_revisions.txt

doc/tutorial.txt

doc/using_aliases.txt

generate_docs.py

setup.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/doc_generate/autodoc_rstx.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

files removed:
.bzrignore

COPYING

INSTALL

Makefile

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

commands.py

converter.py

dir.py

errors.py

fetch.py

foreign

foreign/.bzrignore

foreign/TODO

foreign/__init__.py

foreign/test_versionedfiles.py

foreign/upgrade.py

foreign/versionedfiles.py

mapping.py

notes

notes/roundtripping.txt

remote.py

repository.py

revspec.py

server.py

setup.py

shamap.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_dir.py

tests/test_fetch.py

tests/test_ids.py

tests/test_repository.py

versionedfiles.py

workingtree.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

# Written by Martin Pool.

# Modified by Johan Rydberg <jrydberg@gnu.org>

# Modified by Robert Collins <robert.collins@canonical.com>

# Modified by Aaron Bentley <aaron.bentley@utoronto.ca>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only

updates.

Knit file layout:

lifeless: the data file is made up of "delta records". each delta record has a delta header

that contains; (1) a version id, (2) the size of the delta (in lines), and (3) the digest of

the -expanded data- (ie, the delta applied to the parent). the delta also ends with an

end-marker; simply "end VERSION"

delta can be line or full contents.

... the 8's there are the index number of the annotation.

version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e

59,59,3

8 if ie.executable:

8 e.set('executable', 'yes')

130,130,2

8 if elt.get('executable') == 'yes':

8 ie.executable = True

end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

what's in an index:

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents

09:33 < jrydberg> lifeless: the parents are currently dictionary compressed

09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)

09:33 < lifeless> right

09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used

in the deltas to provide line annotation

"""

# TODOS:

# 10:16 < lifeless> make partial index writes safe

# 10:16 < lifeless> implement 'knit.check()' like weave.check()

# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave

# always' approach.

# move sha1 out of the content so that join is faster at verifying parents

# record content length ?

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import os

import sys

import bzrlib

import bzrlib.errors as errors

from bzrlib.errors import FileExists, NoSuchFile, KnitError, \

InvalidRevisionId, KnitCorrupt, KnitHeaderError, \

RevisionNotPresent, RevisionAlreadyPresent

from bzrlib.tuned_gzip import *

from bzrlib.trace import mutter

from bzrlib.osutils import contains_whitespace, contains_linebreaks, \

sha_strings

from bzrlib.versionedfile import VersionedFile, InterVersionedFile

from bzrlib.tsort import topo_sort

import bzrlib.weave

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data

# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor

# position after writing to work out where it was located. we may need to

# bypass python file buffering.

100

101

DATA_SUFFIX = '.knit'

102

INDEX_SUFFIX = '.kndx'

103

104

105

class KnitContent(object):
    """Lines of one knit version, each paired with its origin.

    The content is kept in ``_lines`` as a sequence of (origin, text)
    pairs; line deltas against another KnitContent can be generated from it.
    """

    def __init__(self, lines):
        # The sequence of (origin, text) pairs is stored by reference.
        self._lines = lines

    def annotate_iter(self):
        """Generate an (origin, text) tuple for every content line."""
        for pair in self._lines:
            origin, text = pair
            yield origin, text

    def annotate(self):
        """Return everything annotate_iter() yields, as a list."""
        return [pair for pair in self.annotate_iter()]

    def line_delta_iter(self, new_lines):
        """Yield the hunks that turn this content into new_lines.

        :param new_lines: the target KnitContent.
        :return: generator of (old_start, old_end, new_count, new_pairs)
            tuples, one per non-'equal' opcode; new_pairs is the matching
            slice of new_lines._lines.
        """
        target_texts = [text for _, text in new_lines._lines]
        source_texts = [text for _, text in self._lines]
        matcher = KnitSequenceMatcher(None, source_texts, target_texts)
        for tag, old_start, old_end, new_start, new_end in matcher.get_opcodes():
            if tag != 'equal':
                replacement = new_lines._lines[new_start:new_end]
                yield (old_start, old_end, new_end - new_start, replacement)

    def line_delta(self, new_lines):
        """Return the hunks of line_delta_iter(new_lines) as a list."""
        return [hunk for hunk in self.line_delta_iter(new_lines)]

    def text(self):
        """Return the text of every line, dropping the origins."""
        texts = []
        for _, text in self._lines:
            texts.append(text)
        return texts

136

137

138

class _KnitFactory(object):
    """Behaviour shared by the knit content factories."""

    def make(self, lines, version):
        """Return a KnitContent whose lines are all attributed to version.

        :param lines: sized sequence of line texts.
        :param version: origin paired with every line.
        """
        origins = [version] * len(lines)
        return KnitContent(zip(origins, lines))

144

145

146

class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\\n
        internal representation is of the format:
        (revid, plaintext)

        :param content: iterable of serialized lines.
        :param version: not used here; each line carries its own origin.
        :return: a KnitContent built from the decoded (revid, text) pairs.
        """
        lines = []
        for line in content:
            # Split on the first space only: the text may contain spaces.
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines, version=None):
        """Yield the hunks of a serialized line delta one at a time.

        :param lines: serialized delta lines, as for parse_line_delta().
        :param version: forwarded to parse_line_delta(); optional so that
            existing one-argument callers keep working, and accepted so the
            signature lines up with KnitPlainFactory.parse_line_delta_iter.
        """
        # parse_line_delta is a method and has to be called; the previous
        # code subscripted it, which raises TypeError.
        for result_item in self.parse_line_delta(lines, version):
            yield result_item

    def parse_line_delta(self, lines, version):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])

        :param lines: iterable of serialized delta lines.
        :param version: not used here; each line carries its own origin.
        :return: list of (start, end, count, contents) tuples.
        """
        result = []
        lines = iter(lines)
        # Bound method that pulls the body lines of the current hunk from
        # the same iterator the outer loop reads headers from.  Named
        # read_line so that it does not shadow next().
        read_line = lines.next
        # walk through the lines parsing.
        for header in lines:
            start, end, count = [int(n) for n in header.split(',')]
            contents = []
            remaining = count
            while remaining:
                origin, text = read_line().split(' ', 1)
                remaining -= 1
                contents.append((origin.decode('utf-8'), text))
            result.append((start, end, count, contents))
        return result

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.

        :return: list of 'revid(utf8) text' strings, one per content line.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta which this inverts.

        :param delta: iterable of (start, end, count, lines) hunks where
            lines holds (origin, text) pairs.
        :return: flat list of header lines and 'revid(utf8) text' lines.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out

212

213

214

class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - its just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        """Yield (start, end, count, pairs) for each hunk in lines.

        :param lines: indexable, sized sequence of serialized delta lines:
            a 'start,end,count' header followed by count text lines, repeated.
        :param version: origin paired with every text line of every hunk.

        The sequence is walked with a cursor and is left unmodified.  The
        earlier pop(0)/del approach emptied the caller's list and shifted
        the remaining elements once per hunk.
        """
        cur = 0
        num_lines = len(lines)
        while cur < num_lines:
            header = lines[cur]
            cur += 1
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[cur:cur + c])
            cur += c

    def parse_line_delta(self, lines, version):
        """Return the hunks of parse_line_delta_iter() as a list."""
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        """Return the plain text lines of content (origins are dropped)."""
        return content.text()

    def lower_line_delta(self, delta):
        """Serialize a delta as header lines followed by plain text lines.

        :param delta: iterable of (start, end, count, lines) hunks where
            lines holds (origin, text) pairs; the origins are not written.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out

246

247

248

def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location.

    :param transport: transport the knit files are written through.
    :param relpath: base name of the knit relative to transport.
    """
    # KnitVersionedFile takes (relpath, transport, file_mode, access_mode,
    # factory, ...).  The previous positional call swapped relpath and
    # transport and put 'w' and the factory class into the wrong slots.
    # The factory has to be an instance, and create=True is required
    # because without it only an existing knit is opened.
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory(), create=True)
    k._data._open_file()

252

253

254

class KnitVersionedFile(VersionedFile):

255

"""Weave-like structure with faster random access.

256

257

A knit stores a number of texts and a summary of the relationships

258

between them. Texts are identified by a string version-id. Texts

259

are normally stored and retrieved as a series of lines, but can

260

also be passed as single strings.

261

262

Lines are stored with the trailing newline (if any) included, to

263

avoid special cases for files with no final newline. Lines are

264

composed of 8-bit characters, not unicode. The combination of

265

these approaches should mean any 'binary' file can be safely

266

stored and retrieved.

267

"""

268

269

def __init__(self, relpath, transport, file_mode=None, access_mode=None,
             factory=None, basis_knit=None, delta=True, create=False):
    """Open, or optionally create, the knit named relpath on transport.

    :param relpath: base name; INDEX_SUFFIX and DATA_SUFFIX are appended
        to it to name the index and the data file.
    :param transport: handed to the index and data objects.
    :param file_mode: forwarded unchanged to the index and data objects.
    :param access_mode: 'r' or 'w'; None means 'w'.
    :param factory: content factory; a KnitAnnotateFactory is created
        when no (or a false) value is given.
    :param basis_knit: optional KnitVersionedFile kept as self.basis_knit.
    :param delta: stored as self.delta.
    :param create: If not True, only open an existing knit.
    """
    if access_mode is None:
        access_mode = 'w'
    super(KnitVersionedFile, self).__init__(access_mode)
    assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
    assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
        type(basis_knit)

    if factory:
        self.factory = factory
    else:
        self.factory = KnitAnnotateFactory()
    self.delta = delta
    self.writable = access_mode == 'w'
    self.basis_knit = basis_knit
    self.filename = relpath
    self.transport = transport

    index_name = relpath + INDEX_SUFFIX
    self._index = _KnitIndex(transport, index_name, access_mode,
                             create=create, file_mode=file_mode)
    # len(self) is evaluated only after the index exists; the data file
    # is created only when creation was requested and len(self) is zero.
    data_name = relpath + DATA_SUFFIX
    create_data = create and not len(self)
    self._data = _KnitData(transport, data_name, access_mode,
                           create=create_data, file_mode=file_mode)

293

294

def __repr__(self):

295

return '%s(%s)' % (self.__class__.__name__,

296

self.transport.abspath(self.filename))

297

298

def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):
    """See VersionedFile._add_delta().

    Stores delta as a 'line-delta' record when a fulltext is reachable
    from delta_parent within 25 steps along first parents.  In every
    other case the work is delegated to the base class implementation.

    :param version_id: id of the version being added.
    :param parents: parent version ids recorded in the index.
    :param delta_parent: version the delta applies to, or None when the
        delta describes a complete text.
    :param sha1: digest stored with the record.
    :param noeol: when true the 'no-eol' option is recorded.
    :param delta: list of (start, end, count, lines) hunks.
    """
    self._check_add(version_id, []) # should we check the lines ?
    self._check_versions_present(parents)

    if delta_parent is None:
        # reconstitute as full text.
        assert len(delta) == 1 or len(delta) == 0
        if len(delta):
            assert delta[0][0] == 0
            assert delta[0][1] == 0, delta[0][1]
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    # determine the current delta chain length.
    # To speed the extract of texts the delta chain is limited
    # to a fixed number of deltas.  This should minimize both
    # I/O and the time spend applying deltas.
    count = 0
    delta_parents = [delta_parent]
    while count < 25:
        parent = delta_parents[0]
        method = self._index.get_method(parent)
        if method == 'fulltext':
            break
        delta_parents = self._index.get_parents(parent)
        count = count + 1
    if method == 'line-delta':
        # did not find a fulltext in the delta limit.
        # just do a normal insertion.
        return super(KnitVersionedFile, self)._add_delta(version_id,
                                                         parents,
                                                         delta_parent,
                                                         sha1,
                                                         noeol,
                                                         delta)

    options = []
    if noeol:
        options.append('no-eol')
    options.append('line-delta')
    store_lines = self.factory.lower_line_delta(delta)

    where, size = self._data.add_record(version_id, sha1, store_lines)
    self._index.add_version(version_id, options, where, size, parents)

359

360

def _add_raw_records(self, records, data):

361

"""Add all the records 'records' with data pre-joined in 'data'.

362

363

:param records: A list of tuples(version_id, options, parents, size).

364

:param data: The data for the records. When it is written, the records

365

are adjusted to have pos pointing into data by the sum of

366

the preceding records sizes.

367

"""

368

# write all the data

369

pos = self._data.add_raw_record(data)

370

index_entries = []

371

for (version_id, options, parents, size) in records:

372

index_entries.append((version_id, options, pos, size, parents))

373

pos += size

374

self._index.add_versions(index_entries)

375

376

def clear_cache(self):

377

"""Clear the data cache only."""

378

self._data.clear_cache()

379

380

def copy_to(self, name, transport):
    """See VersionedFile.copy_to().

    :param name: base name of the copy on the target transport.
    :param transport: transport the copy is written to.
    """
    final_index = name + INDEX_SUFFIX
    staged_index = final_index + '.tmp'
    # copy the current index to a temp index to avoid racing with local
    # writes
    index_source = self.transport.get(self._index._filename)
    transport.put(staged_index, index_source)
    # copy the data file
    data_source = self._data._open_file()
    transport.put(name + DATA_SUFFIX, data_source)
    # rename the copied index into place
    transport.rename(staged_index, final_index)

389

390

def create_empty(self, name, transport, mode=None):
    """Create a new knit at name on transport with this knit's settings.

    The new knit reuses this knit's factory and delta setting and is
    opened with create=True.
    """
    # NOTE(review): `mode` is accepted but not forwarded to the new knit;
    # confirm whether it was meant to be passed on as file_mode.
    settings = {
        'factory': self.factory,
        'delta': self.delta,
        'create': True,
    }
    return KnitVersionedFile(name, transport, **settings)

392

393

def _fix_parents(self, version, new_parents):

394

"""Fix the parents list for version.

395

396

This is done by appending a new version to the index

397

with identical data except for the parents list.

398

the parents list must be a superset of the current

399

list.

400

"""

401

current_values = self._index._cache[version]

402

assert set(current_values[4]).difference(set(new_parents)) == set()

403

self._index.add_version(version,

404

current_values[1],

405

current_values[2],

406

current_values[3],

407

new_parents)

408

409

def get_delta(self, version_id):
    """Get a delta for constructing version from some other version.

    :param version_id: the version to describe.
    :return: (parent, sha1, noeol, delta).  parent is the first parent of
        version_id or None when it has no parents; delta is a list of
        line-delta hunks relative to that parent.
    :raises RevisionNotPresent: when version_id is not in this knit.
    """
    if not self.has_version(version_id):
        raise RevisionNotPresent(version_id, self.filename)

    parents = self.get_parents(version_id)
    parent = None
    if len(parents):
        parent = parents[0]

    data_pos, data_size = self._index.get_position(version_id)
    wanted = ((version_id, data_pos, data_size),)
    data, sha1 = self._data.read_records(wanted)[version_id]
    version_idx = self._index.lookup(version_id)
    noeol = 'no-eol' in self._index.get_options(version_id)

    if self._index.get_method(version_id) != 'fulltext':
        # Stored as a line delta already: parse it and hand it back.
        delta = self.factory.parse_line_delta(data, version_idx)
        return parent, sha1, noeol, delta

    # Stored as a fulltext: diff it against the parent's text, or against
    # an empty text when there is no parent.
    new_content = self.factory.parse_fulltext(data, version_idx)
    old_texts = []
    if parent is not None:
        old_texts = self._get_content(parent).text()
    new_texts = new_content.text()
    delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)
    return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

436

437

def get_graph_with_ghosts(self):

438

"""See VersionedFile.get_graph_with_ghosts()."""

439

graph_items = self._index.get_graph()

440

return dict(graph_items)

441

442

def get_sha1(self, version_id):

443

"""See VersionedFile.get_sha1()."""

444

components = self._get_components(version_id)

445

return components[-1][-1][-1]

446

447

@staticmethod
def get_suffixes():
    """See VersionedFile.get_suffixes().

    :return: a new list holding the data suffix, then the index suffix.
    """
    suffixes = [DATA_SUFFIX]
    suffixes.append(INDEX_SUFFIX)
    return suffixes

451

452

def has_ghost(self, version_id):

453

"""True if there is a ghost reference in the file to version_id."""

454

# maybe we have it

455

if self.has_version(version_id):

456

return False

457

# optimisable if needed by memoising the _ghosts set.

458

items = self._index.get_graph()

459

for node, parents in items:

460

for parent in parents:

461

if parent not in self._index._cache:

462

if parent == version_id:

463

return True

464

return False

465

466

def versions(self):

467

"""See VersionedFile.versions."""

468

return self._index.get_versions()

469

470

def has_version(self, version_id):

471

"""See VersionedFile.has_version."""

472

return self._index.has_version(version_id)

473

474

__contains__ = has_version

475

476

def _merge_annotations(self, content, parents, parent_texts={},

477

delta=None, annotated=None):

478

"""Merge annotations for content. This is done by comparing

479

the annotations based on changed to the text.

480

"""

481

if annotated:

482

delta_seq = None

483

for parent_id in parents:

484

merge_content = self._get_content(parent_id, parent_texts)

485

seq = KnitSequenceMatcher(None, merge_content.text(), content.text())

486

if delta_seq is None:

487

# setup a delta seq to reuse.

488

delta_seq = seq

489

for i, j, n in seq.get_matching_blocks():

490

if n == 0:

491

continue

492

# this appears to copy (origin, text) pairs across to the new

493

# content for any line that matches the last-checked parent.

494

# FIXME: save the sequence control data for delta compression

495

# against the most relevant parent rather than rediffing.

496

content._lines[j:j+n] = merge_content._lines[i:i+n]

497

if delta:

498

if not annotated:

499

reference_content = self._get_content(parents[0], parent_texts)

500

new_texts = content.text()

501

old_texts = reference_content.text()

502

delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)

503

return self._make_line_delta(delta_seq, content)

504

505

def _make_line_delta(self, delta_seq, new_content):

506

"""Generate a line delta from delta_seq and new_content."""

507

diff_hunks = []

508

for op in delta_seq.get_opcodes():

509

if op[0] == 'equal':

510

continue

511

diff_hunks.append((op[1], op[2], op[4]-op[3], new_content._lines[op[3]:op[4]]))

512

return diff_hunks

513

514

def _get_component_versions(self, version_id):

515

basis = self.basis_knit

516

needed_versions = []

517

basis_versions = []

518

cursor = version_id

519

520

while 1:

521

picked_knit = self

522

if basis and basis._index.has_version(cursor):

523

picked_knit = basis

524

basis_versions.append(cursor)

525

method = picked_knit._index.get_method(cursor)

526

needed_versions.append((method, cursor))

527

if method == 'fulltext':

528

break

529

cursor = picked_knit.get_parents(cursor)[0]

530

return needed_versions, basis_versions

531

532

def _get_component_positions(self, version_id):

533

needed_versions, basis_versions = \

534

self._get_component_versions(version_id)

535

assert len(basis_versions) == 0

536

positions = []

537

for method, comp_id in needed_versions:

538

data_pos, data_size = self._index.get_position(comp_id)

539

positions.append((method, comp_id, data_pos, data_size))

540

return positions

541

542

def _get_components(self, version_id):

543

"""Return a list of (version_id, method, data) tuples that

544

makes up version specified by version_id of the knit.

545

546

The components should be applied in the order of the returned

547

list.

548

549

The basis knit will be used to the largest extent possible

550

since it is assumed that accesses to it is faster.

551

"""

552

#profile notes:

553

# 4168 calls in 14912, 2289 internal

554

# 4168 in 9711 to read_records

555

# 52554 in 1250 to get_parents

556

# 170166 in 865 to list.append

557

558

# needed_revisions holds a list of (method, version_id) of

559

# versions that is needed to be fetched to construct the final

560

# version of the file.

561

562

# basis_revisions is a list of versions that needs to be

563

# fetched but exists in the basis knit.

564

565

needed_versions, basis_versions = \

566

self._get_component_versions(version_id)

567

568

components = {}

569

if basis_versions:

570

assert True, "I am broken"

571

basis = self.basis_knit

572

records = []

573

for comp_id in basis_versions:

574

data_pos, data_size = basis._index.get_data_position(comp_id)

575

records.append((piece_id, data_pos, data_size))

576

components.update(basis._data.read_records(records))

577

578

records = []

579

for comp_id in [vid for method, vid in needed_versions

580

if vid not in basis_versions]:

581

data_pos, data_size = self._index.get_position(comp_id)

582

records.append((comp_id, data_pos, data_size))

583

components.update(self._data.read_records(records))

584

585

# get_data_records returns a mapping with the version id as

586

# index and the value as data. The order the components need

587

# to be applied is held by needed_versions (reversed).

588

out = []

589

for method, comp_id in reversed(needed_versions):

590

out.append((comp_id, method, components[comp_id]))

591

592

return out

593

594

def _get_content(self, version_id, parent_texts={}):

595

"""Returns a content object that makes up the specified

596

version."""

597

if not self.has_version(version_id):

598

raise RevisionNotPresent(version_id, self.filename)

599

600

cached_version = parent_texts.get(version_id, None)

601

if cached_version is not None:

602

return cached_version

603

604

if self.basis_knit and version_id in self.basis_knit:

605

return self.basis_knit._get_content(version_id)

606

607

content = None

608

components = self._get_components(version_id)

609

for component_id, method, (data, digest) in components:

610

version_idx = self._index.lookup(component_id)

611

if method == 'fulltext':

612

assert content is None

613

content = self.factory.parse_fulltext(data, version_idx)

614

elif method == 'line-delta':

615

delta = self.factory.parse_line_delta(data, version_idx)

616

content._lines = self._apply_delta(content._lines, delta)

617

618

if 'no-eol' in self._index.get_options(version_id):

619

line = content._lines[-1][1].rstrip('\n')

620

content._lines[-1] = (content._lines[-1][0], line)

621

622

# digest here is the digest from the last applied component.

623

if sha_strings(content.text()) != digest:

624

raise KnitCorrupt(self.filename, 'sha-1 does not match %s' % version_id)

625

626

return content

627

628

def _check_versions_present(self, version_ids):

629

"""Check that all specified versions are present."""

630

version_ids = set(version_ids)

631

for r in list(version_ids):

632

if self._index.has_version(r):

633

version_ids.remove(r)

634

if version_ids:

635

raise RevisionNotPresent(list(version_ids)[0], self.filename)

636

637

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):

638

"""See VersionedFile.add_lines_with_ghosts()."""

639

self._check_add(version_id, lines)

640

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

641

642

def _add_lines(self, version_id, parents, lines, parent_texts):

643

"""See VersionedFile.add_lines."""

644

self._check_add(version_id, lines)

645

self._check_versions_present(parents)

646

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

647

648

def _check_add(self, version_id, lines):
    """Check that version_id and lines are safe to add.

    :raises InvalidRevisionId: if version_id contains whitespace.
    :raises RevisionAlreadyPresent: if version_id is already in the knit.
    """
    assert self.writable, "knit is not opened for write"
    ### FIXME escape. RBC 20060228
    if contains_whitespace(version_id):
        raise InvalidRevisionId(version_id, self.filename)
    if self.has_version(version_id):
        raise RevisionAlreadyPresent(version_id, self.filename)
    for verify in (self._check_lines_not_unicode,
                   self._check_lines_are_lines):
        verify(lines)

658

659

def _add(self, version_id, lines, parents, delta, parent_texts):
    """Add a set of lines on top of version specified by parents.

    If delta is true, compress the text as a line-delta against
    the first parent.

    Any versions not present will be converted into ghosts.

    :param lines: list of lines; a newline is appended in place to the
        last line when it lacks one.
    :param parent_texts: mapping of version id to content object, or None.
    :return: the content object built by the factory for the new version.
    """
    #  461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
    # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
    # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
    # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
    # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
    # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
    # +1383   0      8.0370      8.0370   +<len>
    # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
    # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
    # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
    # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)

    if parent_texts is None:
        parent_texts = {}
    # parents this knit does not have are ghosts and are left out here;
    # the full parent list is still what gets recorded in the index.
    present_parents = [parent for parent in parents
                       if self.has_version(parent)]
    have_parents = len(present_parents) > 0

    if delta and not have_parents:
        delta = False

    # the digest is taken before any newline is appended below.
    digest = sha_strings(lines)
    options = []
    if lines and lines[-1][-1] != '\n':
        options.append('no-eol')
        lines[-1] = lines[-1] + '\n'

    if have_parents and delta:
        # To speed the extract of texts the delta chain is limited
        # to a fixed number of deltas.  This should minimize both
        # I/O and the time spend applying deltas.
        chain_parents = present_parents
        for _ in range(25):
            parent = chain_parents[0]
            method = self._index.get_method(parent)
            if method == 'fulltext':
                break
            chain_parents = self._index.get_parents(parent)
        if method == 'line-delta':
            # no fulltext within reach: store this one as a fulltext.
            delta = False

    lines = self.factory.make(lines, version_id)
    if delta or (self.factory.annotated and have_parents):
        # Merge annotations from parent texts if so is needed.
        delta_hunks = self._merge_annotations(lines, present_parents,
                                              parent_texts, delta,
                                              self.factory.annotated)

    if delta:
        options.append('line-delta')
        store_lines = self.factory.lower_line_delta(delta_hunks)
    else:
        options.append('fulltext')
        store_lines = self.factory.lower_fulltext(lines)

    where, size = self._data.add_record(version_id, digest, store_lines)
    self._index.add_version(version_id, options, where, size, parents)
    return lines

731

732

def check(self, progress_bar=None):
    """See VersionedFile.check().

    Nothing is checked here; progress_bar is accepted and ignored.
    """
    return None

734

735

def _clone_text(self, new_version_id, old_version_id, parents):

736

"""See VersionedFile.clone_text()."""

737

# FIXME RBC 20060228 make fast by only inserting an index with null

738

# delta.

739

self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

740

741

def get_lines(self, version_id):
    """See VersionedFile.get_lines().

    Implemented as a single-version call to get_line_list.
    """
    line_lists = self.get_line_list([version_id])
    return line_lists[0]

744

745

def _get_version_components(self, position_map):

746

records = []

747

for version_id, positions in position_map.iteritems():

748

for method, comp_id, position, size in positions:

749

records.append((comp_id, position, size))

750

record_map = self._data.read_records(records)

751

752

component_map = {}

753

for version_id, positions in position_map.iteritems():

754

components = []

755

for method, comp_id, position, size in positions:

756

data, digest = record_map[comp_id]

757

components.append((comp_id, method, data, digest))

758

component_map[version_id] = components

759

return component_map

760

761

def get_text(self, version_id):
    """See VersionedFile.get_text

    Implemented as a single-version call to get_texts.
    """
    texts = self.get_texts([version_id])
    return texts[0]

764

765

def get_texts(self, version_ids):
    """Return the text of each listed version as one joined string.

    The result is in the order get_line_list returns the versions.
    """
    texts = []
    for version_lines in self.get_line_list(version_ids):
        texts.append(''.join(version_lines))
    return texts

767

768

def get_line_list(self, version_ids):
    """Return the texts of listed versions, one list of lines per version.

    :param version_ids: the versions to extract; iterated twice.
    :return: a list holding content.text() for each requested version, in
        the order of version_ids.
    :raises RevisionNotPresent: if a requested version is not in the knit.
    :raises KnitCorrupt: if the sha-1 of a rebuilt text differs from the
        digest stored with its last applied component.
    """
    position_map = {}
    for version_id in version_ids:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
        position_map[version_id] = self._get_component_positions(version_id)

    component_map = self._get_version_components(position_map)

    text_map = {}
    for version_id, components in component_map.items():
        content = None
        # components run from the requested version back to its fulltext,
        # so they are applied in reverse.
        for component_id, method, data, digest in reversed(components):
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content._lines = self._apply_delta(content._lines, delta)

        if 'no-eol' in self._index.get_options(version_id):
            # the text was stored with a newline appended; take it off.
            last = content._lines[-1]
            content._lines[-1] = (last[0], last[1].rstrip('\n'))

        # digest here is the digest from the last applied component.
        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename,
                              'sha-1 does not match %s' % version_id)

        text_map[version_id] = content.text()
    return [text_map[wanted] for wanted in version_ids]

802

803

def iter_lines_added_or_present_in_versions(self, version_ids=None):
    """See VersionedFile.iter_lines_added_or_present_in_versions().

    Yield every line stored in the records of the requested versions: all
    lines of a fulltext record, and the lines carried by each hunk of a
    line-delta record.  Records are visited in the order self.versions()
    lists them.

    :param version_ids: the versions to visit; all versions when None.
    :raises RevisionNotPresent: if a requested version is not in the knit.
    """
    if version_ids is None:
        version_ids = self.versions()
    # we don't care about inclusions, the caller cares.
    # but we need to setup a list of records to visit.
    # we need version_id, position, length
    requested_versions = list(version_ids)
    # filter for available versions
    for version_id in requested_versions:
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)
    # get a in-component-order queue; a set keeps the membership test from
    # rescanning the request list for every version in the knit.
    requested = set(requested_versions)
    version_id_records = []
    for version_id in self.versions():
        if version_id in requested:
            data_pos, length = self._index.get_position(version_id)
            version_id_records.append((version_id, data_pos, length))

    pb = bzrlib.ui.ui_factory.nested_progress_bar()
    count = 0
    total = len(version_id_records)
    try:
        pb.update('Walking content.', count, total)
        for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
            pb.update('Walking content.', count, total)
            method = self._index.get_method(version_id)
            version_idx = self._index.lookup(version_id)
            assert method in ('fulltext', 'line-delta')
            if method == 'fulltext':
                content = self.factory.parse_fulltext(data, version_idx)
                for line in content.text():
                    yield line
            else:
                delta = self.factory.parse_line_delta(data, version_idx)
                # the hunk's own line count must not be bound to `count`,
                # which tracks progress through the records.
                for start, end, hunk_size, lines in delta:
                    for origin, line in lines:
                        yield line
            count += 1
        pb.update('Walking content.', total, total)
        pb.finished()
    except:
        # also reached when the generator is closed early; the progress
        # bar is finished and the exception carries on.
        pb.update('Walking content.', total, total)
        pb.finished()
        raise

851

852

def num_versions(self):
    """See VersionedFile.num_versions().

    The count is taken from the index.
    """
    index = self._index
    return index.num_versions()

# Lets callers write ``len(knit)``.
__len__ = num_versions

857

858

def annotate_iter(self, version_id):
    """See VersionedFile.annotate_iter.

    Yields the (origin, text) pairs of the content built for version_id.
    """
    annotated = self._get_content(version_id).annotate_iter()
    for pair in annotated:
        origin, text = pair
        yield origin, text

863

864

def get_parents(self, version_id):
    """See VersionedFile.get_parents.

    :raises RevisionNotPresent: if the index lookup raises KeyError.
    """
    # perf notes:
    # optimism counts!
    # 52554 calls in 1264 872 internal down from 3674
    try:
        found = self._index.get_parents(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    return found

873

874

def get_parents_with_ghosts(self, version_id):
    """See VersionedFile.get_parents_with_ghosts.

    :raises RevisionNotPresent: if the index lookup raises KeyError.
    """
    try:
        found = self._index.get_parents_with_ghosts(version_id)
    except KeyError:
        raise RevisionNotPresent(version_id, self.filename)
    return found

880

881

def get_ancestry(self, versions):
    """See VersionedFile.get_ancestry.

    :param versions: a single version id or a sequence of them.
    :return: [] for an empty request, otherwise the index's answer.
    """
    if isinstance(versions, basestring):
        wanted = [versions]
    else:
        wanted = versions
    if not wanted:
        return []
    self._check_versions_present(wanted)
    return self._index.get_ancestry(wanted)

889

890

def get_ancestry_with_ghosts(self, versions):
    """See VersionedFile.get_ancestry_with_ghosts.

    :param versions: a single version id or a sequence of them.
    :return: [] for an empty request, otherwise the index's answer.
    """
    if isinstance(versions, basestring):
        wanted = [versions]
    else:
        wanted = versions
    if not wanted:
        return []
    self._check_versions_present(wanted)
    return self._index.get_ancestry_with_ghosts(wanted)

898

899

#@deprecated_method(zero_eight)

900

def walk(self, version_ids):
    """See VersionedFile.walk.

    The ancestry of version_ids is copied into a temporary Weave, which
    then does the walking; the weave's (lineno, insert_id, dset, line)
    tuples are yielded unchanged.
    """
    # We take the short path here, and extract all relevant texts
    # and put them in a weave and let that do all the work.  Far
    # from optimal, but is much simpler.
    # FIXME RB 20060228 this really is inefficient!
    from bzrlib.weave import Weave

    w = Weave(self.filename)
    # a set, so filtering the sorted graph does not rescan the ancestry
    # list for every version in the knit.
    ancestry = set(self.get_ancestry(version_ids))
    sorted_graph = topo_sort(self._index.get_graph())
    version_list = [vid for vid in sorted_graph if vid in ancestry]

    for version_id in version_list:
        lines = self.get_lines(version_id)
        w.add_lines(version_id, self.get_parents(version_id), lines)

    for lineno, insert_id, dset, line in w.walk(version_ids):
        yield lineno, insert_id, dset, line

919

920

def plan_merge(self, ver_a, ver_b):
    """See VersionedFile.plan_merge.

    Yields (state, text) pairs.  A line found only in ver_a is 'killed-b'
    when the revision that introduced it is in the ancestry of ver_b and
    'new-a' otherwise; likewise 'killed-a' / 'new-b' for a line found only
    in ver_b.  Lines in both are 'unchanged'.
    """
    ancestors_b = set(self.get_ancestry(ver_b))
    ancestors_a = set(self.get_ancestry(ver_a))

    annotated_a = self.annotate(ver_a)
    annotated_b = self.annotate(ver_b)
    plain_a = [text for (origin, text) in annotated_a]
    plain_b = [text for (origin, text) in annotated_b]
    matcher = KnitSequenceMatcher(None, plain_a, plain_b)

    a_cur = 0
    b_cur = 0
    for ai, bi, length in matcher.get_matching_blocks():
        # process all mismatched sections
        # (last mismatched section is handled because blocks always
        # includes a 0-length last block)
        for revision, text in annotated_a[a_cur:ai]:
            if revision in ancestors_b:
                yield 'killed-b', text
            else:
                yield 'new-a', text
        for revision, text in annotated_b[b_cur:bi]:
            if revision in ancestors_a:
                yield 'killed-a', text
            else:
                yield 'new-b', text

        # and now the matched section
        a_cur = ai + length
        b_cur = bi + length
        for text_a, text_b in zip(plain_a[ai:a_cur], plain_b[bi:b_cur]):
            assert text_a == text_b
            yield "unchanged", text_a

958

959

960

class _KnitComponentFile(object):

961

"""One of the files used to implement a knit database"""

962

963

def __init__(self, transport, filename, mode, file_mode=None):

964

self._transport = transport

965

self._filename = filename

966

self._mode = mode

967

self._file_mode=file_mode

968

969

def write_header(self):

970

if self._transport.append(self._filename, StringIO(self.HEADER),

971

mode=self._file_mode):

972

raise KnitCorrupt(self._filename, 'misaligned after writing header')

973

974

def check_header(self, fp):

975

line = fp.readline()

976

if line != self.HEADER:

977

raise KnitHeaderError(badline=line)

978

979

def commit(self):

980

"""Commit is a nop."""

981

982

def __repr__(self):

983

return '%s(%s)' % (self.__class__.__name__, self._filename)

984

985

986

class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to a Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topological sorted.

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.

    The index file on disc contains a header, followed by one line per knit
    record. The same revision can be present in an index file more than once.
    The first occurrence gets assigned a sequence number starting from 0.

    The format of a single line is
    REVISION_ID FLAGS BYTE_OFFSET LENGTH( PARENT_ID|PARENT_SEQUENCE_ID)* :\n
    REVISION_ID is a utf8-encoded revision id
    FLAGS is a comma separated list of flags about the record. Values include
        no-eol, line-delta, fulltext.
    BYTE_OFFSET is the ascii representation of the byte offset in the data file
        that the compressed data starts at.
    LENGTH is the ascii representation of the length of the data file.
    PARENT_ID a utf-8 revision id prefixed by a '.' that is a parent of
        REVISION_ID.
    PARENT_SEQUENCE_ID the ascii representation of the sequence number of a
        revision id already in the knit that is a parent of REVISION_ID.
    The ' :' marker is the end of record marker.

    partial writes:
    when a write is interrupted to the index file, it will result in a line that
    does not end in ' :'. If the ' :' is not present at the end of a line, or at
    the end of the file, then the record that is missing it will be ignored by
    the parser.

    When writing new records to the index file, the data is preceded by '\n'
    to ensure that records always start on new lines even if the last write was
    interrupted. As a result it's normal for the last line in the index to be
    missing a trailing newline. One can be added with no harmful effects.
    """

    HEADER = "# bzr knit index 8\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
         indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            index = len(self._history)
            self._history.append(version_id)
        else:
            index = self._cache[version_id][5]
        self._cache[version_id] = (version_id,
                                   options,
                                   pos,
                                   size,
                                   parents,
                                   index)

    def __init__(self, transport, filename, mode, create=False, file_mode=None):
        """Read the index file into memory.

        :param create: with mode 'w', write a fresh header when the index
            file does not exist instead of raising NoSuchFile.
        :raises NoSuchFile: if the file is missing and is not to be
            created.
        """
        _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the weave.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once.
                # probably what we want for nice behaviour on
                # http is a incremental readlines that yields, or
                # a check for local vs non local indexes,
                for l in fp.readlines():
                    rec = l.split()
                    if len(rec) < 5 or rec[-1] != ':':
                        # corrupt line.
                        # FIXME: in the future we should determine if its a
                        # short write - and ignore it
                        # or a different failure, and raise. RBC 20060407
                        continue
                    count += 1
                    total += 1
                    #pb.update('read knit index', count, total)
                    # See self._parse_parents
                    parents = []
                    for value in rec[4:-1]:
                        if '.' == value[0]:
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # (in lsprof)
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(self._history[int(value)])
                    # end self._parse_parents
                    # self._cache_version(rec[0],
                    #                     rec[1].split(','),
                    #                     int(rec[2]),
                    #                     int(rec[3]),
                    #                     parents)
                    # --- self._cache_version
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = rec[0]
                    if version_id not in self._cache:
                        index = len(self._history)
                        self._history.append(version_id)
                    else:
                        index = self._cache[version_id][5]
                    self._cache[version_id] = (version_id,
                                               rec[1].split(','),
                                               int(rec[2]),
                                               int(rec[3]),
                                               parents,
                                               index)
                    # --- self._cache_version
            except NoSuchFile:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted in to FOO.

        NOTE: the function is retained here for clarity, and for possible
        use in partial index reads. However bulk processing now has
        it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            # the '.' marker is a prefix, exactly as tested by the inlined
            # copy of this loop in __init__.
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        """Return a list of (version_id, parents) for every cached version."""
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok
            # trim ghosts
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            try:
                parents = self._cache[version][4]
            except KeyError:
                # ghost, fake it
                graph[version] = []
            else:
                # got the parents ok
                for parent in parents:
                    if parent not in graph:
                        pending.add(parent)
                graph[version] = parents
        return topo_sort(graph.items())

    def num_versions(self):
        """Return the number of distinct version ids in the index."""
        return len(self._history)

    __len__ = num_versions

    def get_versions(self):
        """Return the version ids in order of first appearance.

        This is the internal _history list itself, not a copy.
        """
        return self._history

    def idx_to_name(self, idx):
        """Return the version id holding sequence number idx."""
        return self._history[idx]

    def lookup(self, version_id):
        """Return the sequence number of version_id."""
        assert version_id in self._cache
        return self._cache[version_id][5]

    def _version_list_to_index(self, versions):
        """Encode parent ids for an index line.

        Versions in the index are written as their sequence number, any
        other as '.' followed by the utf-8 encoded id.
        """
        result_list = []
        for version in versions:
            if version in self._cache:
                # -- inlined lookup() --
                result_list.append(str(self._cache[version][5]))
                # -- end lookup () --
            else:
                result_list.append('.' + version.encode('utf-8'))
        return ' '.join(result_list)

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self.add_versions(((version_id, options, pos, size, parents),))

    def add_versions(self, versions):
        """Add multiple versions to the index.

        :param versions: a list of tuples:
                         (version_id, options, pos, size, parents).
        """
        lines = []
        for version_id, options, pos, size, parents in versions:
            line = "\n%s %s %s %s %s :" % (version_id.encode('utf-8'),
                                           ','.join(options),
                                           pos,
                                           size,
                                           self._version_list_to_index(parents))
            assert isinstance(line, str), \
                'content must be utf-8 encoded: %r' % (line,)
            lines.append(line)
        self._transport.append(self._filename, StringIO(''.join(lines)))
        # cache after writing, so that a failed write leads to missing cache
        # entries not extra ones. XXX TODO: RBC 20060502 in the event of a
        # failure, reload the index or flush it or some such, to prevent
        # writing records that did complete twice.
        for version_id, options, pos, size, parents in versions:
            self._cache_version(version_id, options, pos, size, parents)

    def has_version(self, version_id):
        """True if the version is in the index."""
        return version_id in self._cache

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        entry = self._cache[version_id]
        return (entry[2], entry[3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        """Return the list of flags recorded for version_id."""
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present.

        :raises RevisionNotPresent: naming one of the missing versions.
        """
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            # component files keep their name in _filename; there is no
            # `filename` attribute on this class.
            raise RevisionNotPresent(list(version_ids)[0], self._filename)

1303

1304

1305

class _KnitData(_KnitComponentFile):

1306

"""Contents of the knit data file"""

1307

1308

HEADER = "# bzr knit data 8\n"

1309

1310

def __init__(self, transport, filename, mode, create=False, file_mode=None):
    """Set up access to the knit data file.

    :param create: if true, an empty data file is written straight away.
    :param file_mode: mode handed to the transport when creating the
        file, and remembered by the base class.
    """
    # file_mode is forwarded so that _file_mode is not silently left as
    # None, the same way _KnitIndex.__init__ does it.
    _KnitComponentFile.__init__(self, transport, filename, mode, file_mode)
    self._file = None
    self._checked = False
    if create:
        self._transport.put(self._filename, StringIO(''), mode=file_mode)
    # version id -> (digest, lines) for records written by add_record.
    self._records = {}

1317

1318

def clear_cache(self):
    """Clear the record cache.

    The cache is replaced by a new empty dictionary rather than emptied
    in place.
    """
    self._records = dict()

1321

1322

def _open_file(self):

1323

if self._file is None:

1324

try:

1325

self._file = self._transport.get(self._filename)

1326

except NoSuchFile:

1327

pass

1328

return self._file

1329

1330

def _record_to_data(self, version_id, digest, lines):
    """Convert version_id, digest, lines into a raw data block.

    The block is gzip data made of a 'version <id> <line count> <digest>'
    line, the lines themselves and an 'end <id>' line.

    :return: (len, a StringIO instance with the raw data ready to read.)
    """
    utf8_id = version_id.encode('utf-8')
    header = "version %s %d %s\n" % (utf8_id, len(lines), digest)
    trailer = "end %s\n" % utf8_id

    sio = StringIO()
    data_file = GzipFile(None, mode='wb', fileobj=sio)
    data_file.writelines(chain([header], lines, [trailer]))
    data_file.close()
    # the stream position after closing the gzip file is the record size.
    length = sio.tell()

    sio.seek(0)
    return length, sio

1348

1349

def add_raw_record(self, raw_data):
    """Append a prepared record to the data file.

    :param raw_data: the record as a plain str.
    :return: the offset in the data file raw_data was written.
    """
    assert isinstance(raw_data, str), 'data must be plain bytes'
    stream = StringIO(raw_data)
    return self._transport.append(self._filename, stream)

1356

1357

def add_record(self, version_id, digest, lines):
    """Write new text record to disk.

    The (digest, lines) pair is also remembered in the record cache.

    :return: (start_pos, size): the value returned by the transport's
        append, and the length of the raw record.
    """
    record_size, record_stream = self._record_to_data(version_id, digest,
                                                      lines)
    # cache
    self._records[version_id] = (digest, lines)
    # write to disk
    offset = self._transport.append(self._filename, record_stream)
    return offset, record_size

1366

1367

def _parse_record_header(self, version_id, raw_data):
    """Open a raw record and check its header line against version_id.

    :param version_id: the version the record is expected to describe.
    :param raw_data: the gzip-compressed bytes of one record.
    :return: (stream, header_record): the still-open decompressor, positioned
        after the header line, and the header split into its four fields.
    :raises KnitCorrupt: if the header does not have exactly four fields or
        its second field (UTF-8 decoded) is not version_id.
    """
    stream = GzipFile(mode='rb', fileobj=StringIO(raw_data))
    header = stream.readline().split()
    if len(header) != 4:
        raise KnitCorrupt(self._filename,
                          'unexpected number of elements in record header')
    found_id = header[1]
    if found_id.decode('utf-8') != version_id:
        complaint = 'unexpected version, wanted %r, got %r' % (
            version_id, found_id)
        raise KnitCorrupt(self._filename, complaint)
    return stream, header

1382

1383

def _parse_record(self, version_id, data):
    """Decompress and validate one record, returning its lines and digest.

    :param version_id: the version the record is expected to describe.
    :param data: the raw (gzip-compressed) bytes of the record.
    :return: (record_contents, digest): the list of lines between the
        header and the end line, and the fourth field of the header.
    :raises KnitCorrupt: if the header is inconsistent (see
        _parse_record_header), the end line is missing or does not match
        version_id, or the number of lines differs from the count declared
        in the header.
    """
    # profiling notes:
    # 4168 calls in 2880 217 internal
    # 4168 calls to _parse_record_header in 2121
    # 4168 calls to readlines in 330
    df, rec = self._parse_record_header(version_id, data)
    try:
        record_contents = df.readlines()
    finally:
        # Release the decompressor even when reading fails.
        df.close()
    if not record_contents:
        # Nothing after the header: report corruption rather than letting
        # pop() raise IndexError.
        raise KnitCorrupt(self._filename,
                          'missing end line for version %r' % (version_id,))
    l = record_contents.pop()
    try:
        expected_count = int(rec[2])
    except ValueError:
        raise KnitCorrupt(self._filename,
                          'invalid line count %r in record header for'
                          ' version %r' % (rec[2], version_id))
    # This is validation of on-disk data, so it must not be an assert
    # (asserts are removed when Python runs with -O).
    if len(record_contents) != expected_count:
        raise KnitCorrupt(self._filename,
                          'incorrect number of lines %s != %s for version %r'
                          % (len(record_contents), expected_count,
                             version_id))
    if l.decode('utf-8') != 'end %s\n' % version_id:
        raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                          % (l, version_id))
    return record_contents, rec[3]

1397

1398

def read_records_iter_raw(self, records):
    """Read text records from data file and yield raw data.

    This unpacks enough of the text record to validate the id is
    as expected but that is all.

    It will actively recompress currently cached records on the
    basis that that is cheaper than I/O activity.

    :param records: iterable of (version_id, pos, size) tuples.
    :return: a generator of (version_id, raw_data) in the order given.
    :raises KnitCorrupt: from _parse_record_header, when a record read from
        disk does not carry the expected version id.
    """
    # Decide exactly once which records come from the cache and which from
    # disk, and keep a reference to the cached entries. This is a
    # generator, so the cache may be cleared or extended between yields;
    # re-testing it later would desynchronise the readv stream from the
    # records being yielded.
    plan = []
    needed_ranges = []
    for version_id, pos, size in records:
        if version_id in self._records:
            plan.append((version_id, self._records[version_id]))
        else:
            plan.append((version_id, None))
            needed_ranges.append((pos, size))

    # setup an iterator of the external records:
    # uses readv so nice and fast we hope.
    if needed_ranges:
        # grab the disk data needed.
        raw_records = self._transport.readv(self._filename, needed_ranges)

    for version_id, cached in plan:
        if cached is not None:
            # compress a new version from the cached (digest, lines) pair
            digest, lines = cached
            size, sio = self._record_to_data(version_id, digest, lines)
            yield version_id, sio.getvalue()
        else:
            pos, data = raw_records.next()
            # validate the header
            df, rec = self._parse_record_header(version_id, data)
            df.close()
            yield version_id, data

1432

1433

1434

def read_records_iter(self, records):
    """Read text records from the data file and yield the parsed results.

    Each passed record is a tuple of (version_id, pos, len). Records not
    already cached are fetched with a single readv call, ordered by
    position, then parsed and cached. Results are yielded in the order
    given as (version_id, contents, digest), where contents is a fresh
    list copy of the cached lines.
    """
    # profiling notes:
    # 60890 calls for 4168 extractions in 5045, 683 internal.
    # 4168 calls to readv in 1411
    # 4168 calls to parse_record in 2880

    to_fetch = [(record_id, offset, length)
                for record_id, offset, length in records
                if record_id not in self._records]

    if to_fetch:
        to_fetch.sort(key=lambda entry: entry[1])
        # We take it that the transport optimizes the fetching as well
        # as possible (ie, reads continuous ranges.)
        ranges = [(offset, length) for record_id, offset, length in to_fetch]
        response = self._transport.readv(self._filename, ranges)

        for wanted, fetched in izip(iter(to_fetch), response):
            record_id, offset, length = wanted
            offset, data = fetched
            content, digest = self._parse_record(record_id, data)
            self._records[record_id] = (digest, content)

    for version_id, pos, size in records:
        digest, content = self._records[version_id]
        yield version_id, list(content), digest

1465

1466

def read_records(self, records):
    """Read records into a dictionary.

    :param records: passed straight to read_records_iter.
    :return: dict mapping each record id to its (content, digest) pair; when
        an id occurs more than once the last occurrence is kept.
    """
    return dict((record_id, (content, digest))
                for record_id, content, digest
                in self.read_records_iter(records))

1472

1473

1474

class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_from_factory = KnitVersionedFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Return True only when both source and target are knits."""
        try:
            if not isinstance(source, KnitVersionedFile):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Copies the raw (still compressed) records of the versions the target
        lacks from the source, then merges parent lists of versions whose
        parents differ between source and target.

        :param pb: not used as given; a nested progress bar obtained from
            bzrlib.ui.ui_factory replaces it.
        :param msg: not referenced by this implementation.
        :param version_ids: forwarded to _get_source_version_ids.
        :param ignore_missing: forwarded to _get_source_version_ids.
        :return: the number of versions copied (0 when nothing is needed).
        :raises errors.GraphCycleError: when adopting a source parent would
            make a version its own ancestor.
        """
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)
        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            target_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - target_versions
            shared_versions = self.source_ancestry.intersection(target_versions)
            mismatched_versions = set()
            for version in shared_versions:
                # scan to include needed parents.
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(
                    self.source.get_parents_with_ghosts(version))
                if target_parents == source_parents:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the source-side parents.
                for parent in source_parents:
                    if parent not in target_parents:
                        parent_ancestors = self.source.get_ancestry(parent)
                        if version in parent_ancestors:
                            raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                extra_parents = source_parents.difference(target_parents)
                needed_versions.update(
                    extra_parents.difference(target_versions))
                mismatched_versions.add(version)

            if not (needed_versions or mismatched_versions):
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = []
            for candidate in full_list:
                if (not self.target.has_version(candidate)
                        and candidate in needed_versions):
                    version_list.append(candidate)

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must :
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            raw_datum = []
            raw_records = []
            raw_stream = self.source._data.read_records_iter_raw(
                copy_queue_records)
            for fetched, queued in izip(raw_stream, copy_queue):
                version_id, raw_data = fetched
                version_id2, options, parents = queued
                assert version_id == version_id2, 'logic error, inconsistent results'
                count += 1
                pb.update("Joining knit", count, total)
                raw_records.append((version_id, options, parents, len(raw_data)))
                raw_datum.append(raw_data)
            self.target._add_raw_records(raw_records, ''.join(raw_datum))

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(
                    self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                current_parents = self.target.get_parents_with_ghosts(version)
                additions = list(source_parents.difference(target_parents))
                new_parents = current_parents + additions
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()

1586

1587

1588

# Register InterKnit as an optimiser with InterVersionedFile.
InterVersionedFile.register_optimiser(InterKnit)

1589

1590

1591

class WeaveToKnit(InterVersionedFile):
    """Optimised code paths for weave to knit operations."""

    _matching_file_from_factory = bzrlib.weave.WeaveFile
    _matching_file_to_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Return True only for a weave source and a knit target."""
        try:
            if not isinstance(source, bzrlib.weave.Weave):
                return False
            return isinstance(target, KnitVersionedFile)
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join.

        Adds the lines of every version the target lacks, in topological
        order, then merges parent lists of versions for which the source
        lists parents the target does not.

        :param pb: not used as given; a nested progress bar obtained from
            bzrlib.ui.ui_factory replaces it.
        :param msg: not referenced by this implementation.
        :param version_ids: forwarded to _get_source_version_ids.
        :param ignore_missing: forwarded to _get_source_version_ids.
        :return: the number of versions added (0 when nothing is needed).
        :raises errors.GraphCycleError: when adopting a source parent would
            make a version its own ancestor.
        """
        assert isinstance(self.source, bzrlib.weave.Weave)
        assert isinstance(self.target, KnitVersionedFile)

        version_ids = self._get_source_version_ids(version_ids, ignore_missing)
        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            target_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - target_versions
            shared_versions = self.source_ancestry.intersection(target_versions)
            mismatched_versions = set()
            for version in shared_versions:
                # scan to include needed parents.
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents(version))
                # if all of the source's parents are in the target's, it is fine.
                extra_parents = source_parents.difference(target_parents)
                if not extra_parents:
                    continue
                # FIXME TEST this check for cycles being introduced works
                # the logic is we have a cycle if in our graph we are an
                # ancestor of any of the source-side parents.
                for parent in source_parents:
                    if parent not in target_parents:
                        parent_ancestors = self.source.get_ancestry(parent)
                        if version in parent_ancestors:
                            raise errors.GraphCycleError([parent, version])
                # ensure this parent will be available later.
                needed_versions.update(
                    extra_parents.difference(target_versions))
                mismatched_versions.add(version)

            if not (needed_versions or mismatched_versions):
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = []
            for candidate in full_list:
                if (not self.target.has_version(candidate)
                        and candidate in needed_versions):
                    version_list.append(candidate)

            # do the join:
            count = 0
            total = len(version_list)
            for version_id in version_list:
                pb.update("Converting to knit", count, total)
                parents = self.source.get_parents(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must already have it
                    assert (self.target.has_version(parent))
                text_lines = self.source.get_lines(version_id)
                self.target.add_lines(version_id, parents, text_lines)
                count += 1

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                target_parents = set(
                    self.target.get_parents_with_ghosts(version))
                source_parents = set(self.source.get_parents(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                current_parents = self.target.get_parents_with_ghosts(version)
                additions = list(source_parents.difference(target_parents))
                new_parents = current_parents + additions
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()

1679

1680

1681

# Register WeaveToKnit as an optimiser with InterVersionedFile.
InterVersionedFile.register_optimiser(WeaveToKnit)

1682

1683

1684

class KnitSequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = KnitSequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = KnitSequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = KnitSequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            # j2len is rebound every iteration, so its bound get() must be
            # re-fetched here.
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:KeyError pair produced the
            # following improvement
            #     704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to
            #     704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            #  +211400 0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>

            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

1816

Older »