# Copyright (C) 2005, 2006 by Canonical Ltd
# Written by Martin Pool.
# Modified by Johan Rydberg <jrydberg@gnu.org>
# Modified by Robert Collins <robert.collins@canonical.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Knit versionedfile implementation.

A knit is a versioned file implementation that supports efficient append only
writes.

lifeless: the data file is made up of "delta records".  each delta record has a delta header
that contains: (1) a version id, (2) the size of the delta (in lines), and (3) the digest of
the -expanded data- (ie, the delta applied to the parent).  the delta also ends with an
end-marker: simply "end VERSION"

delta can be line or full contents.
... the 8s in the example below are the index numbers of the annotations.
version robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad 7 c7d23b2a5bd6ca00e8e266cec0ec228158ee9f9e
8         e.set('executable', 'yes')
8         if elt.get('executable') == 'yes':
8             ie.executable = True
end robertc@robertcollins.net-20051003014215-ee2990904cc4c7ad

09:33 < jrydberg> lifeless: each index is made up of a tuple of; version id, options, position, size, parents
09:33 < jrydberg> lifeless: the parents are currently dictionary compressed
09:33 < jrydberg> lifeless: (meaning it currently does not support ghosts)
09:33 < lifeless> right
09:33 < jrydberg> lifeless: the position and size is the range in the data file

so the index sequence is the dictionary compressed sequence number used
in the deltas to provide line annotation.
"""

# 10:16 < lifeless> make partial index writes safe
# 10:16 < lifeless> implement 'knit.check()' like weave.check()
# 10:17 < lifeless> record known ghosts so we can detect when they are filled in rather than the current 'reweave'
# move sha1 out of the content so that join is faster at verifying parents
# record content length ?

from copy import copy
from cStringIO import StringIO
import difflib
import errno
import gzip
from itertools import izip, chain
import zlib

import bzrlib
import bzrlib.errors as errors
from bzrlib.errors import FileExists, NoSuchFile, KnitError, \
        InvalidRevisionId, KnitCorrupt, KnitHeaderError, \
        RevisionNotPresent, RevisionAlreadyPresent
from bzrlib.trace import mutter
from bzrlib.osutils import contains_whitespace, contains_linebreaks, \
     sha_strings
from bzrlib.versionedfile import VersionedFile, InterVersionedFile
from bzrlib.tsort import topo_sort
import bzrlib.ui

# TODO: Split out code specific to this format into an associated object.

# TODO: Can we put in some kind of value to check that the index and data
# files belong together?

# TODO: accommodate binaries, perhaps by storing a byte count

# TODO: function to check whole file

# TODO: atomically append data, then measure backwards from the cursor
# position after writing to work out where it was located.  we may need to
# bypass python file buffering.

DATA_SUFFIX = '.knit'
INDEX_SUFFIX = '.kndx'


class KnitContent(object):
    """Content of a knit version to which deltas can be applied."""

    def __init__(self, lines):
        self._lines = lines

    def annotate_iter(self):
        """Yield tuples of (origin, text) for each content line."""
        for origin, text in self._lines:
            yield origin, text

    def annotate(self):
        """Return a list of (origin, text) tuples."""
        return list(self.annotate_iter())

    def apply_delta(self, delta):
        """Apply delta to this content."""
        offset = 0
        for start, end, count, lines in delta:
            self._lines[offset+start:offset+end] = lines
            offset = offset + (start - end) + count
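
    # Illustrative sketch (not from the original source): a delta hunk
    # (start, end, count, lines) replaces self._lines[start:end] with lines.
    # Applying [(1, 2, 1, [('rev-b', 'B\n')])] to a content whose lines are
    # [('rev-a', 'a\n'), ('rev-a', 'b\n')] leaves
    # [('rev-a', 'a\n'), ('rev-b', 'B\n')].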

    def line_delta_iter(self, new_lines):
        """Generate line-based delta from this content to new_lines."""
        new_texts = [text for origin, text in new_lines._lines]
        old_texts = [text for origin, text in self._lines]
        s = SequenceMatcher(None, old_texts, new_texts)
        for op in s.get_opcodes():
            if op[0] == 'equal':
                continue
            # yield hunks of the form (start, end, new line count, new lines)
            yield (op[1], op[2], op[4]-op[3], new_lines._lines[op[3]:op[4]])

    def line_delta(self, new_lines):
        return list(self.line_delta_iter(new_lines))

    def text(self):
        return [text for origin, text in self._lines]


class _KnitFactory(object):
    """Base factory for creating content objects."""

    def make(self, lines, version):
        num_lines = len(lines)
        return KnitContent(zip([version] * num_lines, lines))


class KnitAnnotateFactory(_KnitFactory):
    """Factory for creating annotated Content objects."""

    annotated = True

    def parse_fulltext(self, content, version):
        """Convert fulltext to internal representation

        fulltext content is of the format
        revid(utf8) plaintext\n
        internal representation is of the format:
        (revid, plaintext)
        """
        lines = []
        for line in content:
            origin, text = line.split(' ', 1)
            lines.append((origin.decode('utf-8'), text))
        return KnitContent(lines)

    def parse_line_delta_iter(self, lines):
        """Convert a line based delta into internal representation.

        line delta is in the form of:
        intstart intend intcount
        1..count lines:
        revid(utf8) newline\n
        internal representation is
        (start, end, count, [1..count tuples (revid, newline)])
        """
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            contents = []
            for i in range(c):
                origin, text = lines.pop(0).split(' ', 1)
                contents.append((origin.decode('utf-8'), text))
            yield start, end, c, contents
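
    # Illustrative sketch (not from the original source): a serialized hunk
    # that replaces line 1 with one new annotated line could look like
    #   "1,2,1\n" followed by "rev-b new text\n",
    # which parse_line_delta_iter yields as (1, 2, 1, [(u'rev-b', 'new text\n')]).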

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines))

    def lower_fulltext(self, content):
        """convert a fulltext content record into a serializable form.

        see parse_fulltext which this inverts.
        """
        return ['%s %s' % (o.encode('utf-8'), t) for o, t in content._lines]

    def lower_line_delta(self, delta):
        """convert a delta into a serializable form.

        See parse_line_delta_iter which this inverts.
        """
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            for origin, text in lines:
                out.append('%s %s' % (origin.encode('utf-8'), text))
        return out


class KnitPlainFactory(_KnitFactory):
    """Factory for creating plain Content objects."""

    annotated = False

    def parse_fulltext(self, content, version):
        """This parses an unannotated fulltext.

        Note that this is not a noop - the internal representation
        has (versionid, line) - it's just a constant versionid.
        """
        return self.make(content, version)

    def parse_line_delta_iter(self, lines, version):
        while lines:
            header = lines.pop(0)
            start, end, c = [int(n) for n in header.split(',')]
            yield start, end, c, zip([version] * c, lines[:c])
            lines = lines[c:]

    def parse_line_delta(self, lines, version):
        return list(self.parse_line_delta_iter(lines, version))

    def lower_fulltext(self, content):
        return content.text()

    def lower_line_delta(self, delta):
        out = []
        for start, end, c, lines in delta:
            out.append('%d,%d,%d\n' % (start, end, c))
            out.extend([text for origin, text in lines])
        return out


def make_empty_knit(transport, relpath):
    """Construct an empty knit at the specified location."""
    k = KnitVersionedFile(relpath, transport, access_mode='w',
                          factory=KnitPlainFactory())


class KnitVersionedFile(VersionedFile):
    """Weave-like structure with faster random access.

    A knit stores a number of texts and a summary of the relationships
    between them.  Texts are identified by a string version-id.  Texts
    are normally stored and retrieved as a series of lines, but can
    also be passed as single strings.

    Lines are stored with the trailing newline (if any) included, to
    avoid special cases for files with no final newline.  Lines are
    composed of 8-bit characters, not unicode.  The combination of
    these approaches should mean any 'binary' file can be safely
    stored and retrieved.
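
    A minimal usage sketch (illustrative only, assuming a bzrlib transport
    object ``t``)::

        k = KnitVersionedFile('foo', t, create=True)
        k.add_lines('rev-1', [], ['hello\n'])
        k.add_lines('rev-2', ['rev-1'], ['hello\n', 'world\n'])
        assert k.get_lines('rev-2') == ['hello\n', 'world\n']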
    """

    def __init__(self, relpath, transport, file_mode=None, access_mode=None, factory=None,
                 basis_knit=None, delta=True, create=False):
        """Construct a knit at location specified by relpath.

        :param create: If not True, only open an existing knit.
        """
        if access_mode is None:
            access_mode = 'w'
        super(KnitVersionedFile, self).__init__(access_mode)
        assert access_mode in ('r', 'w'), "invalid mode specified %r" % access_mode
        assert not basis_knit or isinstance(basis_knit, KnitVersionedFile), \
            type(basis_knit)

        self.transport = transport
        self.filename = relpath
        self.basis_knit = basis_knit
        self.factory = factory or KnitAnnotateFactory()
        self.writable = (access_mode == 'w')
        self.delta = delta

        self._index = _KnitIndex(transport, relpath + INDEX_SUFFIX,
            access_mode, create=create)
        self._data = _KnitData(transport, relpath + DATA_SUFFIX,
            access_mode, create=not len(self.versions()))

    def clear_cache(self):
        """Clear the data cache only."""
        self._data.clear_cache()

    def copy_to(self, name, transport):
        """See VersionedFile.copy_to()."""
        # copy the current index to a temp index to avoid racing with local
        # writes:
        transport.put(name + INDEX_SUFFIX + '.tmp', self.transport.get(self._index._filename))
        # copy the data file:
        transport.put(name + DATA_SUFFIX, self._data._open_file())
        # rename the copied index into place
        transport.rename(name + INDEX_SUFFIX + '.tmp', name + INDEX_SUFFIX)

    def create_empty(self, name, transport, mode=None):
        return KnitVersionedFile(name, transport, factory=self.factory, delta=self.delta, create=True)

    def _fix_parents(self, version, new_parents):
        """Fix the parents list for version.

        This is done by appending a new version to the index
        with identical data except for the parents list.
        the parents list must be a superset of the current
        list.
        """
        current_values = self._index._cache[version]
        assert set(current_values[4]).difference(set(new_parents)) == set()
        self._index.add_version(version,
                                current_values[1],
                                current_values[2],
                                current_values[3],
                                new_parents)

    def get_graph_with_ghosts(self):
        """See VersionedFile.get_graph_with_ghosts()."""
        graph_items = self._index.get_graph()
        return dict(graph_items)

    @staticmethod
    def get_suffixes():
        """See VersionedFile.get_suffixes()."""
        return [DATA_SUFFIX, INDEX_SUFFIX]

    def has_ghost(self, version_id):
        """True if there is a ghost reference in the file to version_id."""
        # maybe we have it
        if self.has_version(version_id):
            return False
        # optimisable if needed by memoising the _ghosts set.
        items = self._index.get_graph()
        for node, parents in items:
            for parent in parents:
                if parent not in self._index._cache:
                    if parent == version_id:
                        return True
        return False

    def versions(self):
        """See VersionedFile.versions."""
        return self._index.get_versions()

    def has_version(self, version_id):
        """See VersionedFile.has_version."""
        return self._index.has_version(version_id)

    __contains__ = has_version

    def _merge_annotations(self, content, parents, parent_texts={},
                           delta=None, annotated=None):
        """Merge annotations for content.  This is done by comparing
        the annotations based on changes to the text.
        """
        diff_hunks = []
        reference_content = self._get_content(parents[0], parent_texts)
        new_texts = [text for origin, text in content._lines]
        old_texts = [text for origin, text in reference_content._lines]
        delta_seq = SequenceMatcher(None, old_texts, new_texts)
        for op in delta_seq.get_opcodes():
            if op[0] == 'equal':
                continue
            diff_hunks.append((op[1], op[2], op[4]-op[3], content._lines[op[3]:op[4]]))
        if annotated:
            # reuse the delta_sequencer if possible:
            for i, j, n in delta_seq.get_matching_blocks():
                # this appears to copy (origin, text) pairs across to the new
                # content for any line that matches the last-checked parent.
                # FIXME: save the sequence control data for delta compression
                # against the most relevant parent rather than rediffing.
                content._lines[j:j+n] = reference_content._lines[i:i+n]
            for parent_id in parents[1:]:
                merge_content = self._get_content(parent_id, parent_texts)
                seq = SequenceMatcher(None, merge_content.text(), content.text())
                for i, j, n in seq.get_matching_blocks():
                    # this appears to copy (origin, text) pairs across to the new
                    # content for any line that matches the last-checked parent.
                    # FIXME: save the sequence control data for delta compression
                    # against the most relevant parent rather than rediffing.
                    content._lines[j:j+n] = merge_content._lines[i:i+n]
        if delta:
            return diff_hunks

    def _get_components(self, version_id):
        """Return a list of (version_id, method, data) tuples that
        makes up version specified by version_id of the knit.

        The components should be applied in the order of the returned
        list.

        The basis knit will be used to the largest extent possible
        since it is assumed that accesses to it are faster.
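
        For example (illustrative only): a version stored as two line-deltas
        on top of a fulltext comes back roughly as
        [(v1, 'fulltext', data1), (v2, 'line-delta', data2), (v3, 'line-delta', data3)],
        to be applied in that order.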
        """
        # needed_revisions holds a list of (method, version_id) of
        # versions that need to be fetched to construct the final
        # version of the file.
        #
        # basis_revisions is a list of versions that need to be
        # fetched but exist in the basis knit.
        basis = self.basis_knit
        needed_versions = []
        basis_versions = []
        cursor = version_id
        while 1:
            picked_knit = self
            if basis and basis._index.has_version(cursor):
                picked_knit = basis
                basis_versions.append(cursor)
            method = picked_knit._index.get_method(cursor)
            needed_versions.append((method, cursor))
            if method == 'fulltext':
                break
            cursor = picked_knit.get_parents(cursor)[0]

        components = {}
        if basis_versions:
            records = []
            for comp_id in basis_versions:
                data_pos, data_size = basis._index.get_data_position(comp_id)
                records.append((comp_id, data_pos, data_size))
            components.update(basis._data.read_records(records))

        records = []
        for comp_id in [vid for method, vid in needed_versions
                        if vid not in basis_versions]:
            data_pos, data_size = self._index.get_position(comp_id)
            records.append((comp_id, data_pos, data_size))
        components.update(self._data.read_records(records))

        # get_data_records returns a mapping with the version id as
        # index and the value as data.  The order the components need
        # to be applied is held by needed_versions (reversed).
        out = []
        for method, comp_id in reversed(needed_versions):
            out.append((comp_id, method, components[comp_id]))

        return out

    def _get_content(self, version_id, parent_texts={}):
        """Returns a content object that makes up the specified
        version."""
        if not self.has_version(version_id):
            raise RevisionNotPresent(version_id, self.filename)

        cached_version = parent_texts.get(version_id, None)
        if cached_version is not None:
            return cached_version

        if self.basis_knit and version_id in self.basis_knit:
            return self.basis_knit._get_content(version_id)

        content = None
        components = self._get_components(version_id)
        for component_id, method, (data, digest) in components:
            version_idx = self._index.lookup(component_id)
            if method == 'fulltext':
                assert content is None
                content = self.factory.parse_fulltext(data, version_idx)
            elif method == 'line-delta':
                delta = self.factory.parse_line_delta(data, version_idx)
                content.apply_delta(delta)

        if 'no-eol' in self._index.get_options(version_id):
            line = content._lines[-1][1].rstrip('\n')
            content._lines[-1] = (content._lines[-1][0], line)

        if sha_strings(content.text()) != digest:
            raise KnitCorrupt(self.filename, 'sha-1 does not match')

        return content

    def _check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for r in list(version_ids):
            if self._index.has_version(r):
                version_ids.remove(r)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self.filename)

    def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines_with_ghosts()."""
        self._check_add(version_id, lines)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _add_lines(self, version_id, parents, lines, parent_texts):
        """See VersionedFile.add_lines."""
        self._check_add(version_id, lines)
        self._check_versions_present(parents)
        return self._add(version_id, lines[:], parents, self.delta, parent_texts)

    def _check_add(self, version_id, lines):
        """check that version_id and lines are safe to add."""
        assert self.writable, "knit is not opened for write"
        ### FIXME escape. RBC 20060228
        if contains_whitespace(version_id):
            raise InvalidRevisionId(version_id)
        if self.has_version(version_id):
            raise RevisionAlreadyPresent(version_id, self.filename)
        if False or __debug__:
            for l in lines:
                assert '\n' not in l[:-1]

    def _add(self, version_id, lines, parents, delta, parent_texts):
        """Add a set of lines on top of version specified by parents.

        If delta is true, compress the text as a line-delta against
        the first parent.

        Any versions not present will be converted into ghosts.
        """
        # 461    0   6546.0390     43.9100   bzrlib.knit:489(_add)
        # +400    0    889.4890    418.9790   +bzrlib.knit:192(lower_fulltext)
        # +461    0   1364.8070    108.8030   +bzrlib.knit:996(add_record)
        # +461    0    193.3940     41.5720   +bzrlib.knit:898(add_version)
        # +461    0    134.0590     18.3810   +bzrlib.osutils:361(sha_strings)
        # +461    0     36.3420     15.4540   +bzrlib.knit:146(make)
        # +1383   0      8.0370      8.0370   +<len>
        # +61     0     13.5770      7.9190   +bzrlib.knit:199(lower_line_delta)
        # +61     0    963.3470      7.8740   +bzrlib.knit:427(_get_content)
        # +61     0    973.9950      5.2950   +bzrlib.knit:136(line_delta)
        # +61     0   1918.1800      5.2640   +bzrlib.knit:359(_merge_annotations)
        present_parents = []
        ghosts = []
        if parent_texts is None:
            parent_texts = {}
        for parent in parents:
            if not self.has_version(parent):
                ghosts.append(parent)
            else:
                present_parents.append(parent)

        if delta and not len(present_parents):
            delta = False

        digest = sha_strings(lines)
        options = []
        if lines[-1][-1] != '\n':
            options.append('no-eol')
            lines[-1] = lines[-1] + '\n'

        if len(present_parents) and delta:
            # To speed the extract of texts the delta chain is limited
            # to a fixed number of deltas.  This should minimize both
            # I/O and the time spend applying deltas.
            count = 0
            delta_parents = present_parents
            while count < 25:
                parent = delta_parents[0]
                method = self._index.get_method(parent)
                if method == 'fulltext':
                    break
                delta_parents = self._index.get_parents(parent)
                count = count + 1
            if method == 'line-delta':
                # did not hit a fulltext within the chain limit, so
                # store a fulltext instead of another delta.
                delta = False

        lines = self.factory.make(lines, version_id)
        if delta or (self.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if so is needed.
            delta_hunks = self._merge_annotations(lines, present_parents, parent_texts,
                                                  delta, self.factory.annotated)

        if delta:
            options.append('line-delta')
            store_lines = self.factory.lower_line_delta(delta_hunks)
        else:
            options.append('fulltext')
            store_lines = self.factory.lower_fulltext(lines)

        where, size = self._data.add_record(version_id, digest, store_lines)
        self._index.add_version(version_id, options, where, size, parents)
        return digest

    def check(self, progress_bar=None):
        """See VersionedFile.check()."""

    def _clone_text(self, new_version_id, old_version_id, parents):
        """See VersionedFile.clone_text()."""
        # FIXME RBC 20060228 make fast by only inserting an index with null delta.
        self.add_lines(new_version_id, parents, self.get_lines(old_version_id))

    def get_lines(self, version_id):
        """See VersionedFile.get_lines()."""
        return self._get_content(version_id).text()

    def iter_lines_added_or_present_in_versions(self, version_ids=None):
        """See VersionedFile.iter_lines_added_or_present_in_versions()."""
        if version_ids is None:
            version_ids = self.versions()
        # we don't care about inclusions, the caller cares.
        # but we need to setup a list of records to visit.
        # we need version_id, position, length
        version_id_records = []
        requested_versions = list(version_ids)
        # filter for available versions
        for version_id in requested_versions:
            if not self.has_version(version_id):
                raise RevisionNotPresent(version_id, self.filename)
        # get an in-component-order queue:
        version_ids = []
        for version_id in self.versions():
            if version_id in requested_versions:
                version_ids.append(version_id)
                data_pos, length = self._index.get_position(version_id)
                version_id_records.append((version_id, data_pos, length))

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        count = 0
        total = len(version_id_records)
        try:
            pb.update('Walking content.', count, total)
            for version_id, data, sha_value in \
                self._data.read_records_iter(version_id_records):
                pb.update('Walking content.', count, total)
                method = self._index.get_method(version_id)
                version_idx = self._index.lookup(version_id)
                assert method in ('fulltext', 'line-delta')
                if method == 'fulltext':
                    content = self.factory.parse_fulltext(data, version_idx)
                    for line in content.text():
                        yield line
                else:
                    delta = self.factory.parse_line_delta(data, version_idx)
                    for start, end, count, lines in delta:
                        for origin, line in lines:
                            yield line
            pb.update('Walking content.', total, total)
            pb.finished()
        except:
            pb.update('Walking content.', total, total)
            pb.finished()
            raise

    def num_versions(self):
        """See VersionedFile.num_versions()."""
        return self._index.num_versions()

    __len__ = num_versions

    def annotate_iter(self, version_id):
        """See VersionedFile.annotate_iter."""
        content = self._get_content(version_id)
        for origin, text in content.annotate_iter():
            yield origin, text

    def get_parents(self, version_id):
        """See VersionedFile.get_parents."""
        self._check_versions_present([version_id])
        return list(self._index.get_parents(version_id))

    def get_parents_with_ghosts(self, version_id):
        """See VersionedFile.get_parents_with_ghosts."""
        self._check_versions_present([version_id])
        return list(self._index.get_parents_with_ghosts(version_id))

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry(versions)

    def get_ancestry_with_ghosts(self, versions):
        """See VersionedFile.get_ancestry_with_ghosts."""
        if isinstance(versions, basestring):
            versions = [versions]
        if not versions:
            return []
        self._check_versions_present(versions)
        return self._index.get_ancestry_with_ghosts(versions)

    #@deprecated_method(zero_eight)
    def walk(self, version_ids):
        """See VersionedFile.walk."""
        # We take the short path here, and extract all relevant texts
        # and put them in a weave and let that do all the work.  Far
        # from optimal, but is much simpler.
        # FIXME RB 20060228 this really is inefficient!
        from bzrlib.weave import Weave

        w = Weave(self.filename)
        ancestry = self.get_ancestry(version_ids)
        sorted_graph = topo_sort(self._index.get_graph())
        version_list = [vid for vid in sorted_graph if vid in ancestry]

        for version_id in version_list:
            lines = self.get_lines(version_id)
            w.add_lines(version_id, self.get_parents(version_id), lines)

        for lineno, insert_id, dset, line in w.walk(version_ids):
            yield lineno, insert_id, dset, line


class _KnitComponentFile(object):
    """One of the files used to implement a knit database"""

    def __init__(self, transport, filename, mode):
        self._transport = transport
        self._filename = filename
        self._mode = mode

    def write_header(self):
        if self._transport.append(self._filename, StringIO(self.HEADER)):
            raise KnitCorrupt(self._filename, 'misaligned after writing header')

    def check_header(self, fp):
        line = fp.read(len(self.HEADER))
        if line != self.HEADER:
            raise KnitHeaderError(badline=line)

    def commit(self):
        """Commit is a nop."""

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._filename)


class _KnitIndex(_KnitComponentFile):
    """Manages knit index file.

    The index is already kept in memory and read on startup, to enable
    fast lookups of revision information.  The cursor of the index
    file is always pointing to the end, making it easy to append
    entries.

    _cache is a cache for fast mapping from version id to a Index
    object.

    _history is a cache for fast mapping from indexes to version ids.

    The index data format is dictionary compressed when it comes to
    parent references; an index entry may only have parents with a
    lower index number.  As a result, the index is topologically sorted.
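
    For illustration only (not a verbatim example from the original), an
    index line roughly has the shape "version-id options pos size parent-refs",
    where each parent reference is either the integer index of an earlier
    entry or '.version-id' for a ghost, e.g.
    "rev-2 line-delta 1024 88 0 .ghost-rev".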

    Duplicate entries may be written to the index for a single version id
    if this is done then the latter one completely replaces the former:
    this allows updates to correct version and parent information.
    Note that the two entries may share the delta, and that successive
    annotations and references MUST point to the first entry.
    """

    HEADER = "# bzr knit index 7\n"

    # speed of knit parsing went from 280 ms to 280 ms with slots addition.
    # __slots__ = ['_cache', '_history', '_transport', '_filename']

    def _cache_version(self, version_id, options, pos, size, parents):
        """Cache a version record in the history array and index cache.

        This is inlined into __init__ for performance. KEEP IN SYNC.
        (It saves 60ms, 25% of the __init__ overhead on local 4000 record
        indexes).
        """
        # only want the _history index to reference the 1st index entry
        # for version_id
        if version_id not in self._cache:
            self._history.append(version_id)
        self._cache[version_id] = (version_id, options, pos, size, parents)

    def __init__(self, transport, filename, mode, create=False):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._cache = {}
        # position in _history is the 'official' index for a revision
        # but the values may have come from a newer entry.
        # so - wc -l of a knit index is != the number of unique names
        # in the knit.
        self._history = []
        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            count = 0
            total = 1
            try:
                pb.update('read knit index', count, total)
                fp = self._transport.get(self._filename)
                self.check_header(fp)
                # readlines reads the whole file at once:
                # bad for transports like http, good for local disk
                # we save 60 ms doing this one change (
                # from calling readline each time to calling
                # readlines once).
                # probably what we want for nice behaviour on
                # http is an incremental readlines that yields, or
                # a check for local vs non local indexes,
                for l in fp.readlines():
                    rec = l.split()
                    #pb.update('read knit index', count, total)
                    # See self._parse_parents
                    parents = []
                    for value in rec[4:]:
                        if value[0] == '.':
                            # uncompressed reference
                            parents.append(value[1:])
                        else:
                            # this is 15/4000ms faster than isinstance,
                            # this function is called thousands of times a
                            # second so small variations add up.
                            assert value.__class__ is str
                            parents.append(self._history[int(value)])
                    # end self._parse_parents
                    # self._cache_version(rec[0],
                    #                     rec[1].split(','),
                    #                     int(rec[2]),
                    #                     int(rec[3]),
                    #                     parents)
                    # --- self._cache_version
                    # only want the _history index to reference the 1st
                    # index entry for version_id
                    version_id = rec[0]
                    if version_id not in self._cache:
                        self._history.append(version_id)
                    self._cache[version_id] = (version_id,
                                               rec[1].split(','),
                                               int(rec[2]),
                                               int(rec[3]),
                                               parents)
                    # --- self._cache_version
            except NoSuchFile, e:
                if mode != 'w' or not create:
                    raise
                self.write_header()
        finally:
            pb.update('read knit index', total, total)
            pb.finished()

    def _parse_parents(self, compressed_parents):
        """convert a list of string parent values into version ids.

        ints are looked up in the index.
        .FOO values are ghosts and converted into FOO.

        NOTE: the function is retained here for clarity, and for possible
              use in partial index reads. However bulk processing now has
              it inlined in __init__ for inner-loop optimisation.
        """
        result = []
        for value in compressed_parents:
            if value[0] == '.':
                # uncompressed reference
                result.append(value[1:])
            else:
                # this is 15/4000ms faster than isinstance,
                # this function is called thousands of times a
                # second so small variations add up.
                assert value.__class__ is str
                result.append(self._history[int(value)])
        return result

    def get_graph(self):
        graph = []
        for version_id, index in self._cache.iteritems():
            graph.append((version_id, index[4]))
        return graph

    def get_ancestry(self, versions):
        """See VersionedFile.get_ancestry."""
        # get a graph of all the mentioned versions:
        graph = {}
        pending = set(versions)
        while len(pending):
            version = pending.pop()
            parents = self._cache[version][4]
            # got the parents ok; trim ghosts out of the parent list:
            parents = [parent for parent in parents if parent in self._cache]
            for parent in parents:
                # if not completed and not a ghost
                if parent not in graph:
                    pending.add(parent)
            graph[version] = parents
        return topo_sort(graph.items())
897
def get_ancestry_with_ghosts(self, versions):
898
"""See VersionedFile.get_ancestry_with_ghosts."""
899
# get a graph of all the mentioned versions:
901
pending = set(versions)
903
version = pending.pop()
905
parents = self._cache[version][4]
912
for parent in parents:
913
if parent not in graph:
915
graph[version] = parents
916
return topo_sort(graph.items())
918
def num_versions(self):
919
return len(self._history)
921
__len__ = num_versions
923
def get_versions(self):
926
def idx_to_name(self, idx):
927
return self._history[idx]
929
def lookup(self, version_id):
930
assert version_id in self._cache
931
return self._history.index(version_id)
933
def _version_list_to_index(self, versions):
935
for version in versions:
936
if version in self._cache:
937
result_list.append(str(self._history.index(version)))
939
result_list.append('.' + version.encode('utf-8'))
940
return ' '.join(result_list)
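
    # Illustrative example (not from the original source): with 'rev-1' cached
    # at history index 0 and 'ghost-rev' absent from the index,
    # _version_list_to_index(['rev-1', 'ghost-rev']) returns "0 .ghost-rev".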

    def add_version(self, version_id, options, pos, size, parents):
        """Add a version record to the index."""
        self._cache_version(version_id, options, pos, size, parents)

        content = "%s %s %s %s %s\n" % (version_id.encode('utf-8'),
                                        ','.join(options),
                                        pos,
                                        size,
                                        self._version_list_to_index(parents))
        assert isinstance(content, str), 'content must be utf-8 encoded'
        self._transport.append(self._filename, StringIO(content))

    def has_version(self, version_id):
        """True if the version is in the index."""
        return self._cache.has_key(version_id)

    def get_position(self, version_id):
        """Return data position and size of specified version."""
        return (self._cache[version_id][2],
                self._cache[version_id][3])

    def get_method(self, version_id):
        """Return compression method of specified version."""
        options = self._cache[version_id][1]
        if 'fulltext' in options:
            return 'fulltext'
        else:
            assert 'line-delta' in options
            return 'line-delta'

    def get_options(self, version_id):
        return self._cache[version_id][1]

    def get_parents(self, version_id):
        """Return parents of specified version ignoring ghosts."""
        return [parent for parent in self._cache[version_id][4]
                if parent in self._cache]

    def get_parents_with_ghosts(self, version_id):
        """Return parents of specified version with ghosts."""
        return self._cache[version_id][4]

    def check_versions_present(self, version_ids):
        """Check that all specified versions are present."""
        version_ids = set(version_ids)
        for version_id in list(version_ids):
            if version_id in self._cache:
                version_ids.remove(version_id)
        if version_ids:
            raise RevisionNotPresent(list(version_ids)[0], self._filename)


class _KnitData(_KnitComponentFile):
    """Contents of the knit data file"""

    HEADER = "# bzr knit data 7\n"

    def __init__(self, transport, filename, mode, create=False):
        _KnitComponentFile.__init__(self, transport, filename, mode)
        self._file = None
        self._checked = False
        if create:
            self._transport.put(self._filename, StringIO(''))
        self._records = {}

    def clear_cache(self):
        """Clear the record cache."""
        self._records = {}

    def _open_file(self):
        if self._file is None:
            try:
                self._file = self._transport.get(self._filename)
            except NoSuchFile:
                pass
        return self._file

    def _record_to_data(self, version_id, digest, lines):
        """Convert version_id, digest, lines into a raw data block.

        :return: (len, a StringIO instance with the raw data ready to read.)
        """
        sio = StringIO()
        data_file = GzipFile(None, mode='wb', fileobj=sio)
        data_file.writelines(chain(
            ["version %s %d %s\n" % (version_id.encode('utf-8'),
                                     len(lines),
                                     digest)],
            lines,
            ["end %s\n\n" % version_id.encode('utf-8')]))
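        # The uncompressed payload written above is, illustratively:
        #   version <utf8 version id> <line count> <sha1 digest>\n
        #   <the stored lines>
        #   end <utf8 version id>\n\n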
        data_file.close()
        length = sio.tell()
        sio.seek(0)
        return length, sio

    def add_raw_record(self, raw_data):
        """Append a prepared record to the data file."""
        assert isinstance(raw_data, str), 'data must be plain bytes'
        start_pos = self._transport.append(self._filename, StringIO(raw_data))
        return start_pos, len(raw_data)

    def add_record(self, version_id, digest, lines):
        """Write new text record to disk.  Returns the position in the
        file where it was written."""
        size, sio = self._record_to_data(version_id, digest, lines)
        # cache the record for later reads
        self._records[version_id] = (digest, lines)
        # write to disk
        start_pos = self._transport.append(self._filename, sio)
        return start_pos, size

    def _parse_record_header(self, version_id, raw_data):
        """Parse a record header for consistency.

        :return: the header and the decompressor stream.
                 as (stream, header_record)
        """
        df = GzipFile(mode='rb', fileobj=StringIO(raw_data))
        rec = df.readline().split()
        if len(rec) != 4:
            raise KnitCorrupt(self._filename, 'unexpected number of elements in record header')
        if rec[1].decode('utf-8') != version_id:
            raise KnitCorrupt(self._filename,
                              'unexpected version, wanted %r, got %r' % (
                                  version_id, rec[1]))
        return df, rec

    def _parse_record(self, version_id, data):
        df, rec = self._parse_record_header(version_id, data)
        lines = int(rec[2])
        record_contents = self._read_record_contents(df, lines)
        l = df.readline()
        if l.decode('utf-8') != 'end %s\n' % version_id:
            raise KnitCorrupt(self._filename, 'unexpected version end line %r, wanted %r'
                              % (l, version_id))
        df.close()
        return record_contents, rec[3]

    def _read_record_contents(self, df, record_lines):
        """Read and return n lines from datafile."""
        r = []
        for i in range(record_lines):
            r.append(df.readline())
        return r

    def read_records_iter_raw(self, records):
        """Read text records from data file and yield raw data.

        This unpacks enough of the text record to validate the id is
        as expected but that's all.

        It will actively recompress currently cached records on the
        basis that that is cheaper than I/O activity.
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        # setup an iterator of the external records:
        # uses readv so nice and fast we hope.
        if len(needed_records):
            # grab the disk data needed.
            raw_records = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

        for version_id, pos, size in records:
            if version_id in self._records:
                # compress a new version
                size, sio = self._record_to_data(version_id,
                                                 self._records[version_id][0],
                                                 self._records[version_id][1])
                yield version_id, sio.getvalue()
            else:
                pos, data = raw_records.next()
                # validate the header
                df, rec = self._parse_record_header(version_id, data)
                df.close()
                yield version_id, data

    def read_records_iter(self, records):
        """Read text records from data file and yield result.

        Each passed record is a tuple of (version_id, pos, len) and
        will be read in the given order.  Yields (version_id,
        contents, digest).
        """
        needed_records = []
        for version_id, pos, size in records:
            if version_id not in self._records:
                needed_records.append((version_id, pos, size))

        if len(needed_records):
            # We take it that the transport optimizes the fetching as well
            # as possible (ie, reads continuous ranges.)
            response = self._transport.readv(self._filename,
                [(pos, size) for version_id, pos, size in needed_records])

            for (record_id, pos, size), (pos, data) in izip(iter(needed_records), response):
                content, digest = self._parse_record(record_id, data)
                self._records[record_id] = (digest, content)

        for version_id, pos, size in records:
            yield version_id, copy(self._records[version_id][1]), copy(self._records[version_id][0])

    def read_records(self, records):
        """Read records into a dictionary."""
        components = {}
        for record_id, content, digest in self.read_records_iter(records):
            components[record_id] = (content, digest)
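        # Illustrative shape (not from the original source): the result maps
        # version ids to (stored_lines, sha1_digest) pairs,
        # e.g. {'rev-1': (lines, digest)}.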
        return components


class InterKnit(InterVersionedFile):
    """Optimised code paths for knit to knit operations."""

    _matching_file_factory = KnitVersionedFile

    @staticmethod
    def is_compatible(source, target):
        """Be compatible with knits."""
        try:
            return (isinstance(source, KnitVersionedFile) and
                    isinstance(target, KnitVersionedFile))
        except AttributeError:
            return False

    def join(self, pb=None, msg=None, version_ids=None, ignore_missing=False):
        """See InterVersionedFile.join."""
        assert isinstance(self.source, KnitVersionedFile)
        assert isinstance(self.target, KnitVersionedFile)

        if version_ids is None:
            version_ids = self.source.versions()
        else:
            if not ignore_missing:
                self.source._check_versions_present(version_ids)
            else:
                version_ids = set(self.source.versions()).intersection(
                    set(version_ids))

        if not version_ids:
            return 0

        pb = bzrlib.ui.ui_factory.nested_progress_bar()
        try:
            version_ids = list(version_ids)
            if None in version_ids:
                version_ids.remove(None)

            self.source_ancestry = set(self.source.get_ancestry(version_ids))
            this_versions = set(self.target._index.get_versions())
            needed_versions = self.source_ancestry - this_versions
            cross_check_versions = self.source_ancestry.intersection(this_versions)
            mismatched_versions = set()
            for version in cross_check_versions:
                # scan to include needed parents.
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                if n1 != n2:
                    # FIXME TEST this check for cycles being introduced works
                    # the logic is we have a cycle if in our graph we are an
                    # ancestor of any of the n2 revisions.
                    for parent in n2:
                        if parent in n1:
                            # safe
                            continue
                        else:
                            parent_ancestors = self.source.get_ancestry(parent)
                            if version in parent_ancestors:
                                raise errors.GraphCycleError([parent, version])
                    # ensure this parent will be available later.
                    new_parents = n2.difference(n1)
                    needed_versions.update(new_parents.difference(this_versions))
                    mismatched_versions.add(version)

            if not needed_versions and not cross_check_versions:
                return 0
            full_list = topo_sort(self.source.get_graph())

            version_list = [i for i in full_list if (not self.target.has_version(i)
                            and i in needed_versions)]

            # plan the join:
            copy_queue = []
            copy_queue_records = []
            copy_set = set()
            for version_id in version_list:
                options = self.source._index.get_options(version_id)
                parents = self.source._index.get_parents_with_ghosts(version_id)
                # check that it will be a consistent copy:
                for parent in parents:
                    # if source has the parent, we must:
                    # * already have it or
                    # * have it scheduled already
                    # otherwise we don't care
                    assert (self.target.has_version(parent) or
                            parent in copy_set or
                            not self.source.has_version(parent))
                data_pos, data_size = self.source._index.get_position(version_id)
                copy_queue_records.append((version_id, data_pos, data_size))
                copy_queue.append((version_id, options, parents))
                copy_set.add(version_id)

            # data suck the join:
            count = 0
            total = len(version_list)
            # we want the raw gzip for bulk copying, but the record validated
            # just enough to be sure it's the right one.
            # TODO: consider writev or write combining to reduce
            # death of a thousand cuts feeling.
            for (version_id, raw_data), \
                (version_id2, options, parents) in \
                izip(self.source._data.read_records_iter_raw(copy_queue_records),
                     copy_queue):
                assert version_id == version_id2, 'logic error, inconsistent results'
                count = count + 1
                pb.update("Joining knit", count, total)
                pos, size = self.target._data.add_raw_record(raw_data)
                self.target._index.add_version(version_id, options, pos, size, parents)

            for version in mismatched_versions:
                # FIXME RBC 20060309 is this needed?
                n1 = set(self.target.get_parents_with_ghosts(version))
                n2 = set(self.source.get_parents_with_ghosts(version))
                # write a combined record to our history preserving the current
                # parents as first in the list
                new_parents = self.target.get_parents_with_ghosts(version) + list(n2.difference(n1))
                self.target.fix_parents(version, new_parents)
            return count
        finally:
            pb.finished()


InterVersionedFile.register_optimiser(InterKnit)
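
# Illustrative note (not from the original source): registering the optimiser
# means InterVersionedFile.get(source, target) can hand back an InterKnit when
# both files are knits, so joins between knits take the fast path above.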


# make GzipFile faster:
class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.
    +58971     0    328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def write(self, data):
        if self.mode != gzip.WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"
        data_len = len(data)
        self.size = self.size + data_len
        self.crc = zlib.crc32(data, self.crc)
        self.fileobj.write( self.compress.compress(data) )
        self.offset += data_len

    def writelines(self, lines):
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))


class SequenceMatcher(difflib.SequenceMatcher):
    """Knit tuned sequence matcher.

    This is based on profiling of difflib which indicated some improvements
    for our usage pattern.
    """

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}

            # changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
            # following improvement
            #      704  0   4650.5320   2620.7410   bzrlib.knit:1336(find_longest_match)
            # +326674  0   1655.1210   1655.1210   +<method 'get' of 'dict' objects>
            #  +76519  0    374.6700    374.6700   +<method 'has_key' of 'dict' objects>
            # to:
            #      704  0   3733.2820   2209.6520   bzrlib.knit:1336(find_longest_match)
            # +211400  0   1147.3520   1147.3520   +<method 'get' of 'dict' objects>
            #  +76519  0    376.2780    376.2780   +<method 'has_key' of 'dict' objects>
            try:
                js = b2j[a[i]]
            except KeyError:
                pass
            else:
                for j in js:
                    # a[i] matches b[j]
                    if j >= blo:
                        if j >= bhi:
                            break
                        k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
                        if k > bestsize:
                            besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize