/brz/remove-bazaar : revision 2796.2.12

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Aaron Bentley
Date: 2007-09-13 01:54:49 UTC
mfrom: (2817 +trunk)
mto: This revision was merged to the branch mainline in revision 2826.
Revision ID: aaron.bentley@utoronto.ca-20070913015449-bjjw2njf3in5roq7

Merge bzr.dev

files added:
bzrlib/_patiencediff_c.c

bzrlib/patiencediff.py

bzrlib/tests/commands/test_update.py

files renamed:
bzrlib/patiencediff.py => bzrlib/_patiencediff_py.py

files modified:
NEWS

bzrlib/__init__.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/commands.py

bzrlib/errors.py

bzrlib/help_topics.py

bzrlib/knit.py

bzrlib/reconcile.py

bzrlib/repofmt/knitrepo.py

bzrlib/repofmt/weaverepo.py

bzrlib/repository.py

bzrlib/store/revision/knit.py

bzrlib/tag.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_version.py

bzrlib/tests/branch_implementations/test_sprout.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/commands/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/intertree_implementations/test_compare.py

bzrlib/tests/repository_implementations/test_commit_builder.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_ftp_transport.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_revert.py

bzrlib/tests/test_tag.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/workingtree_implementations/test_executable.py

bzrlib/tests/workingtree_implementations/test_inv.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/transport/__init__.py

bzrlib/transport/ftp.py

bzrlib/tree.py

bzrlib/version.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/workingtree.py

bzrlib/workingtree_4.py

doc/developers/HACKING.txt

setup.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

from copy import copy

from cStringIO import StringIO

import difflib

from itertools import izip, chain

import operator

import os

134

133

class KnitContent(object):

135

134

"""Content of a knit version to which deltas can be applied."""

136

135

137

def __init__(self, lines):

138

self._lines = lines

139

140

def annotate_iter(self):

141

"""Yield tuples of (origin, text) for each content line."""

142

return iter(self._lines)

143

144

136

def annotate(self):

145

137

"""Return a list of (origin, text) tuples."""

146

138

return list(self.annotate_iter())

149

141

"""Generate line-based delta from this content to new_lines."""

150

142

new_texts = new_lines.text()

151

143

old_texts = self.text()

152

s = KnitSequenceMatcher(None, old_texts, new_texts)

144

s = patiencediff.PatienceSequenceMatcher(None, old_texts, new_texts)

153

145

for tag, i1, i2, j1, j2 in s.get_opcodes():

154

146

if tag == 'equal':

155

147

continue

159

151

def line_delta(self, new_lines):

160

152

return list(self.line_delta_iter(new_lines))

161

153

162

def text(self):

163

return [text for origin, text in self._lines]

164

165

def copy(self):

166

return KnitContent(self._lines[:])

167

168

154

@staticmethod

169

155

def get_line_delta_blocks(knit_delta, source, target):

170

156

"""Extract SequenceMatcher.get_matching_blocks() from a knit delta"""

192

178

yield s_pos + (target_len - t_pos), target_len, 0

193

179

194

180

195

class _KnitFactory(object):

196

"""Base factory for creating content objects."""

197

198

def make(self, lines, version_id):

199

num_lines = len(lines)

200

return KnitContent(zip([version_id] * num_lines, lines))

201

202

203

class KnitAnnotateFactory(_KnitFactory):

181

class AnnotatedKnitContent(KnitContent):

182

"""Annotated content."""

183

184

def __init__(self, lines):

185

self._lines = lines

186

187

def annotate_iter(self):

188

"""Yield tuples of (origin, text) for each content line."""

189

return iter(self._lines)

190

191

def strip_last_line_newline(self):

192

line = self._lines[-1][1].rstrip('\n')

193

self._lines[-1] = (self._lines[-1][0], line)

194

195

def text(self):

196

return [text for origin, text in self._lines]

197

198

def copy(self):

199

return AnnotatedKnitContent(self._lines[:])

200

201

202

class PlainKnitContent(KnitContent):

203

"""Unannotated content.

204

205

When annotate[_iter] is called on this content, the same version is reported

206

for all lines. Generally, annotate[_iter] is not useful on PlainKnitContent

207

objects.

208

"""

209

210

def __init__(self, lines, version_id):

211

self._lines = lines

212

self._version_id = version_id

213

214

def annotate_iter(self):

215

"""Yield tuples of (origin, text) for each content line."""

216

for line in self._lines:

217

yield self._version_id, line

218

219

def copy(self):

220

return PlainKnitContent(self._lines[:], self._version_id)

221

222

def strip_last_line_newline(self):

223

self._lines[-1] = self._lines[-1].rstrip('\n')

224

225

def text(self):

226

return self._lines

227

228

229

class KnitAnnotateFactory(object):

204

230

"""Factory for creating annotated Content objects."""

205

231

206

232

annotated = True

207

233

234

def make(self, lines, version_id):

235

num_lines = len(lines)

236

return AnnotatedKnitContent(zip([version_id] * num_lines, lines))

237

208

238

def parse_fulltext(self, content, version_id):

209

239

"""Convert fulltext to internal representation

210

240

218

248

# Figure out a way to not require the overhead of turning the

219

249

# list back into tuples.

220

250

lines = [tuple(line.split(' ', 1)) for line in content]

221

return KnitContent(lines)

251

return AnnotatedKnitContent(lines)

222

252

223

253

def parse_line_delta_iter(self, lines):

224

254

return iter(self.parse_line_delta(lines))

296

326

return content.annotate_iter()

297

327

298

328

299

class KnitPlainFactory(_KnitFactory):

329

class KnitPlainFactory(object):

300

330

"""Factory for creating plain Content objects."""

301

331

302

332

annotated = False

303

333

334

def make(self, lines, version_id):

335

return PlainKnitContent(lines, version_id)

336

304

337

def parse_fulltext(self, content, version_id):

305

338

"""This parses an unannotated fulltext.

306

339

316

349

header = lines[cur]

317

350

cur += 1

318

351

start, end, c = [int(n) for n in header.split(',')]

319

yield start, end, c, zip([version_id] * c, lines[cur:cur+c])

352

yield start, end, c, lines[cur:cur+c]

320

353

cur += c

321

354

322

355

def parse_line_delta(self, lines, version_id):

347

380

out = []

348

381

for start, end, c, lines in delta:

349

382

out.append('%d,%d,%d\n' % (start, end, c))

350

out.extend([text for origin, text in lines])

383

out.extend(lines)

351

384

return out

352

385

353

386

def annotate_iter(self, knit, version_id):

453

486

454

487

return fulltext_size > delta_size

455

488

456

def _add_delta(self, version_id, parents, delta_parent, sha1, noeol, delta):

457

"""See VersionedFile._add_delta()."""

458

self._check_add(version_id, []) # should we check the lines ?

459

self._check_versions_present(parents)

460

present_parents = []

461

ghosts = []

462

parent_texts = {}

463

for parent in parents:

464

if not self.has_version(parent):

465

ghosts.append(parent)

466

else:

467

present_parents.append(parent)

468

469

if delta_parent is None:

470

# reconstitute as full text.

471

assert len(delta) == 1 or len(delta) == 0

472

if len(delta):

473

assert delta[0][0] == 0

474

assert delta[0][1] == 0, delta[0][1]

475

return super(KnitVersionedFile, self)._add_delta(version_id,

476

parents,

477

delta_parent,

478

sha1,

479

noeol,

480

delta)

481

482

digest = sha1

483

484

options = []

485

if noeol:

486

options.append('no-eol')

487

488

if delta_parent is not None:

489

# determine the current delta chain length.

490

# To speed the extract of texts the delta chain is limited

491

# to a fixed number of deltas. This should minimize both

492

# I/O and the time spend applying deltas.

493

# The window was changed to a maximum of 200 deltas, but also added

494

# was a check that the total compressed size of the deltas is

495

# smaller than the compressed size of the fulltext.

496

if not self._check_should_delta([delta_parent]):

497

# We don't want a delta here, just do a normal insertion.

498

return super(KnitVersionedFile, self)._add_delta(version_id,

499

parents,

500

delta_parent,

501

sha1,

502

noeol,

503

delta)

504

505

options.append('line-delta')

506

store_lines = self.factory.lower_line_delta(delta)

507

508

access_memo = self._data.add_record(version_id, digest, store_lines)

509

self._index.add_version(version_id, options, access_memo, parents)

510

511

489

def _add_raw_records(self, records, data):

512

490

"""Add all the records 'records' with data pre-joined in 'data'.

513

491

642

620

"""Get a delta for constructing version from some other version."""

643

621

version_id = osutils.safe_revision_id(version_id)

644

622

self.check_not_reserved_id(version_id)

645

if not self.has_version(version_id):

646

raise RevisionNotPresent(version_id, self.filename)

647

648

623

parents = self.get_parents(version_id)

649

624

if len(parents):

650

625

parent = parents[0]

661

636

else:

662

637

old_texts = []

663

638

new_texts = new_content.text()

664

delta_seq = KnitSequenceMatcher(None, old_texts, new_texts)

639

delta_seq = patiencediff.PatienceSequenceMatcher(None, old_texts,

640

new_texts)

665

641

return parent, sha1, noeol, self._make_line_delta(delta_seq, new_content)

666

642

else:

667

643

delta = self.factory.parse_line_delta(data, version_id)

841

817

def _get_content(self, version_id, parent_texts={}):

842

818

"""Returns a content object that makes up the specified

843

819

version."""

844

if not self.has_version(version_id):

845

raise RevisionNotPresent(version_id, self.filename)

846

847

820

cached_version = parent_texts.get(version_id, None)

848

821

if cached_version is not None:

822

if not self.has_version(version_id):

823

raise RevisionNotPresent(version_id, self.filename)

849

824

return cached_version

850

825

851

826

text_map, contents_map = self._get_content_maps([version_id])

855

830

"""Check that all specified versions are present."""

856

831

self._index.check_versions_present(version_ids)

857

832

858

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts):

833

def _add_lines_with_ghosts(self, version_id, parents, lines, parent_texts,

834

nostore_sha, random_id, check_content):

859

835

"""See VersionedFile.add_lines_with_ghosts()."""

860

self._check_add(version_id, lines)

861

return self._add(version_id, lines[:], parents, self.delta, parent_texts)

836

self._check_add(version_id, lines, random_id, check_content)

837

return self._add(version_id, lines, parents, self.delta,

838

parent_texts, None, nostore_sha)

862

839

863

840

def _add_lines(self, version_id, parents, lines, parent_texts,

864

left_matching_blocks=None):

841

left_matching_blocks, nostore_sha, random_id, check_content):

865

842

"""See VersionedFile.add_lines."""

866

self._check_add(version_id, lines)

843

self._check_add(version_id, lines, random_id, check_content)

867

844

self._check_versions_present(parents)

868

845

return self._add(version_id, lines[:], parents, self.delta,

869

parent_texts, left_matching_blocks)

846

parent_texts, left_matching_blocks, nostore_sha)

870

847

871

def _check_add(self, version_id, lines):

848

def _check_add(self, version_id, lines, random_id, check_content):

872

849

"""check that version_id and lines are safe to add."""

873

assert self.writable, "knit is not opened for write"

874

### FIXME escape. RBC 20060228

875

850

if contains_whitespace(version_id):

876

851

raise InvalidRevisionId(version_id, self.filename)

877

852

self.check_not_reserved_id(version_id)

878

if self.has_version(version_id):

853

# Technically this could be avoided if we are happy to allow duplicate

854

# id insertion when other things than bzr core insert texts, but it

855

# seems useful for folk using the knit api directly to have some safety

856

# blanket that we can disable.

857

if not random_id and self.has_version(version_id):

879

858

raise RevisionAlreadyPresent(version_id, self.filename)

880

self._check_lines_not_unicode(lines)

881

self._check_lines_are_lines(lines)

859

if check_content:

860

self._check_lines_not_unicode(lines)

861

self._check_lines_are_lines(lines)

882

862

883

863

def _add(self, version_id, lines, parents, delta, parent_texts,

884

left_matching_blocks=None):

864

left_matching_blocks, nostore_sha):

885

865

"""Add a set of lines on top of version specified by parents.

886

866

887

867

If delta is true, compress the text as a line-delta against

902

882

# +61 0 1918.1800 5.2640 +bzrlib.knit:359(_merge_annotations)

903

883

904

884

present_parents = []

905

ghosts = []

906

885

if parent_texts is None:

907

886

parent_texts = {}

908

887

for parent in parents:

909

if not self.has_version(parent):

910

ghosts.append(parent)

911

else:

888

if self.has_version(parent):

912

889

present_parents.append(parent)

913

890

914

if delta and not len(present_parents):

891

# can only compress against the left most present parent.

892

if (delta and

893

(len(present_parents) == 0 or

894

present_parents[0] != parents[0])):

915

895

delta = False

916

896

917

897

digest = sha_strings(lines)

898

if nostore_sha == digest:

899

raise errors.ExistingContent

918

900

text_length = sum(map(len, lines))

919

901

options = []

920

902

if lines:

921

903

if lines[-1][-1] != '\n':

904

# copy the contents of lines.

905

lines = lines[:]

922

906

options.append('no-eol')

923

907

lines[-1] = lines[-1] + '\n'

924

908

925

if len(present_parents) and delta:

909

if delta:

926

910

# To speed the extract of texts the delta chain is limited

927

911

# to a fixed number of deltas. This should minimize both

928

912

# I/O and the time spend applying deltas.

929

913

delta = self._check_should_delta(present_parents)

930

914

931

915

assert isinstance(version_id, str)

932

lines = self.factory.make(lines, version_id)

916

content = self.factory.make(lines, version_id)

933

917

if delta or (self.factory.annotated and len(present_parents) > 0):

934

# Merge annotations from parent texts if so is needed.

935

delta_hunks = self._merge_annotations(lines, present_parents,

918

# Merge annotations from parent texts if needed.

919

delta_hunks = self._merge_annotations(content, present_parents,

936

920

parent_texts, delta, self.factory.annotated,

937

921

left_matching_blocks)

938

922

941

925

store_lines = self.factory.lower_line_delta(delta_hunks)

942

926

else:

943

927

options.append('fulltext')

944

store_lines = self.factory.lower_fulltext(lines)

928

store_lines = self.factory.lower_fulltext(content)

945

929

946

930

access_memo = self._data.add_record(version_id, digest, store_lines)

947

931

self._index.add_version(version_id, options, access_memo, parents)

948

return digest, text_length, lines

932

return digest, text_length, content

949

933

950

934

def check(self, progress_bar=None):

951

935

"""See VersionedFile.check()."""

1035

1019

elif method == 'line-delta':

1036

1020

delta = self.factory.parse_line_delta(data, version_id)

1037

1021

content = content.copy()

1038

content._lines = self._apply_delta(content._lines,

1022

content._lines = self._apply_delta(content._lines,

1039

1023

delta)

1040

1024

content_map[component_id] = content

1041

1025

1042

1026

if 'no-eol' in self._index.get_options(version_id):

1043

1027

content = content.copy()

1044

line = content._lines[-1][1].rstrip('\n')

1045

content._lines[-1] = (content._lines[-1][0], line)

1028

content.strip_last_line_newline()

1046

1029

final_content[version_id] = content

1047

1030

1048

1031

# digest here is the digest from the last applied component.

1049

1032

text = content.text()

1050

1033

if sha_strings(text) != digest:

1051

raise KnitCorrupt(self.filename,

1034

raise KnitCorrupt(self.filename,

1052

1035

'sha-1 does not match %s' % version_id)

1053

1036

1054

text_map[version_id] = text

1055

return text_map, final_content

1037

text_map[version_id] = text

1038

return text_map, final_content

1056

1039

1057

1040

def iter_lines_added_or_present_in_versions(self, version_ids=None,

1058

1041

pb=None):

2395

2378

InterVersionedFile.register_optimiser(WeaveToKnit)

2396

2379

2397

2380

2398

class KnitSequenceMatcher(difflib.SequenceMatcher):

2399

"""Knit tuned sequence matcher.

2400

2401

This is based on profiling of difflib which indicated some improvements

2402

for our usage pattern.

2403

"""

2404

2405

def find_longest_match(self, alo, ahi, blo, bhi):

2406

"""Find longest matching block in a[alo:ahi] and b[blo:bhi].

2407

2408

If isjunk is not defined:

2409

2410

Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where

2411

alo <= i <= i+k <= ahi

2412

blo <= j <= j+k <= bhi

2413

and for all (i',j',k') meeting those conditions,

2414

k >= k'

2415

i <= i'

2416

and if i == i', j <= j'

2417

2418

In other words, of all maximal matching blocks, return one that

2419

starts earliest in a, and of all those maximal matching blocks that

2420

start earliest in a, return the one that starts earliest in b.

2421

2422

>>> s = SequenceMatcher(None, " abcd", "abcd abcd")

2423

>>> s.find_longest_match(0, 5, 0, 9)

2424

(0, 4, 5)

2425

2426

If isjunk is defined, first the longest matching block is

2427

determined as above, but with the additional restriction that no

2428

junk element appears in the block. Then that block is extended as

2429

far as possible by matching (only) junk elements on both sides. So

2430

the resulting block never matches on junk except as identical junk

2431

happens to be adjacent to an "interesting" match.

2432

2433

Here's the same example as before, but considering blanks to be

2434

junk. That prevents " abcd" from matching the " abcd" at the tail

2435

end of the second sequence directly. Instead only the "abcd" can

2436

match, and matches the leftmost "abcd" in the second sequence:

2437

2438

>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")

2439

>>> s.find_longest_match(0, 5, 0, 9)

2440

(1, 0, 4)

2441

2442

If no blocks match, return (alo, blo, 0).

2443

2444

>>> s = SequenceMatcher(None, "ab", "c")

2445

>>> s.find_longest_match(0, 2, 0, 1)

2446

(0, 0, 0)

2447

"""

2448

2449

# CAUTION: stripping common prefix or suffix would be incorrect.

2450

# E.g.,

2451

# ab

2452

# acab

2453

# Longest matching block is "ab", but if common prefix is

2454

# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so

2455

# strip, so ends up claiming that ab is changed to acab by

2456

# inserting "ca" in the middle. That's minimal but unintuitive:

2457

# "it's obvious" that someone inserted "ac" at the front.

2458

# Windiff ends up at the same place as diff, but by pairing up

2459

# the unique 'b's and then matching the first two 'a's.

2460

2461

a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk

2462

besti, bestj, bestsize = alo, blo, 0

2463

# find longest junk-free match

2464

# during an iteration of the loop, j2len[j] = length of longest

2465

# junk-free match ending with a[i-1] and b[j]

2466

j2len = {}

2467

# nothing = []

2468

b2jget = b2j.get

2469

for i in xrange(alo, ahi):

2470

# look at all instances of a[i] in b; note that because

2471

# b2j has no junk keys, the loop is skipped if a[i] is junk

2472

j2lenget = j2len.get

2473

newj2len = {}

2474

2475

# changing b2j.get(a[i], nothing) to a try:KeyError pair produced the

2476

# following improvement

2477

# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)

2478

# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>

2479

# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>

2480

# to

2481

# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)

2482

# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>

2483

# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>

2484

2485

try:

2486

js = b2j[a[i]]

2487

except KeyError:

2488

pass

2489

else:

2490

for j in js:

2491

# a[i] matches b[j]

2492

if j >= blo:

2493

if j >= bhi:

2494

break

2495

k = newj2len[j] = 1 + j2lenget(-1 + j, 0)

2496

if k > bestsize:

2497

besti, bestj, bestsize = 1 + i-k, 1 + j-k, k

2498

j2len = newj2len

2499

2500

# Extend the best by non-junk elements on each end. In particular,

2501

# "popular" non-junk elements aren't in b2j, which greatly speeds

2502

# the inner loop above, but also means "the best" match so far

2503

# doesn't contain any junk *or* popular non-junk elements.

2504

while besti > alo and bestj > blo and \

2505

not isbjunk(b[bestj-1]) and \

2506

a[besti-1] == b[bestj-1]:

2507

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

2508

while besti+bestsize < ahi and bestj+bestsize < bhi and \

2509

not isbjunk(b[bestj+bestsize]) and \

2510

a[besti+bestsize] == b[bestj+bestsize]:

2511

bestsize += 1

2512

2513

# Now that we have a wholly interesting match (albeit possibly

2514

# empty!), we may as well suck up the matching junk on each

2515

# side of it too. Can't think of a good reason not to, and it

2516

# saves post-processing the (possibly considerable) expense of

2517

# figuring out what to do with it. In the case of an empty

2518

# interesting match, this is clearly the right thing to do,

2519

# because no other kind of match is possible in the regions.

2520

while besti > alo and bestj > blo and \

2521

isbjunk(b[bestj-1]) and \

2522

a[besti-1] == b[bestj-1]:

2523

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

2524

while besti+bestsize < ahi and bestj+bestsize < bhi and \

2525

isbjunk(b[bestj+bestsize]) and \

2526

a[besti+bestsize] == b[bestj+bestsize]:

2527

bestsize = bestsize + 1

2528

2529

return besti, bestj, bestsize

2381

# Deprecated, use PatienceSequenceMatcher instead

2382

KnitSequenceMatcher = patiencediff.PatienceSequenceMatcher

2530

2383

2531

2384

2532

2385

def annotate_knit(knit, revision_id):

Older »