/brz/remove-bazaar : revision 1596.2.35

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/knit.py

Committer: Robert Collins
Date: 2006-03-12 17:09:11 UTC
mto: (1615.1.2 bzr.mbp.integration)
mto: This revision was merged to the branch mainline in revision 1616.
Revision ID: robertc@robertcollins.net-20060312170911-306a47e0478ec183

Subclass SequenceMatcher to get a slightly faster (in our case) find_longest_match routine.

files modified:
bzrlib/knit.py

bzrlib/progress.py

Show diffs side-by-side

added added

removed removed

bzrlib/knit.py

from copy import copy

from cStringIO import StringIO

import difflib

from difflib import SequenceMatcher

import gzip

from itertools import izip, chain

import os

127

126

"""Generate line-based delta from this content to new_lines."""

128

127

new_texts = [text for origin, text in new_lines._lines]

129

128

old_texts = [text for origin, text in self._lines]

130

s = difflib.SequenceMatcher(None, old_texts, new_texts)

129

s = SequenceMatcher(None, old_texts, new_texts)

131

130

for op in s.get_opcodes():

132

131

if op[0] == 'equal':

133

132

continue

366

365

reference_content = self._get_content(parents[0], parent_texts)

367

366

new_texts = [text for origin, text in content._lines]

368

367

old_texts = [text for origin, text in reference_content._lines]

369

delta_seq = difflib.SequenceMatcher(None, old_texts, new_texts)

368

delta_seq = SequenceMatcher(None, old_texts, new_texts)

370

369

for op in delta_seq.get_opcodes():

371

370

if op[0] == 'equal':

372

371

continue

1325

1324

# this batch call is a lot faster :).

1326

1325

# (4 seconds to 1 seconds for the sample upgrades I was testing).

1327

1326

self.write(''.join(lines))

1327

1328

1329

class SequenceMatcher(difflib.SequenceMatcher):

1330

"""Knit tuned sequence matcher.

1331

1332

This is based on profiling of difflib which indicated some improvements

1333

for our usage pattern.

1334

"""

1335

1336

def find_longest_match(self, alo, ahi, blo, bhi):

1337

"""Find longest matching block in a[alo:ahi] and b[blo:bhi].

1338

1339

If isjunk is not defined:

1340

1341

Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where

1342

alo <= i <= i+k <= ahi

1343

blo <= j <= j+k <= bhi

1344

and for all (i',j',k') meeting those conditions,

1345

k >= k'

1346

i <= i'

1347

and if i == i', j <= j'

1348

1349

In other words, of all maximal matching blocks, return one that

1350

starts earliest in a, and of all those maximal matching blocks that

1351

start earliest in a, return the one that starts earliest in b.

1352

1353

>>> s = SequenceMatcher(None, " abcd", "abcd abcd")

1354

>>> s.find_longest_match(0, 5, 0, 9)

1355

(0, 4, 5)

1356

1357

If isjunk is defined, first the longest matching block is

1358

determined as above, but with the additional restriction that no

1359

junk element appears in the block. Then that block is extended as

1360

far as possible by matching (only) junk elements on both sides. So

1361

the resulting block never matches on junk except as identical junk

1362

happens to be adjacent to an "interesting" match.

1363

1364

Here's the same example as before, but considering blanks to be

1365

junk. That prevents " abcd" from matching the " abcd" at the tail

1366

end of the second sequence directly. Instead only the "abcd" can

1367

match, and matches the leftmost "abcd" in the second sequence:

1368

1369

>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")

1370

>>> s.find_longest_match(0, 5, 0, 9)

1371

(1, 0, 4)

1372

1373

If no blocks match, return (alo, blo, 0).

1374

1375

>>> s = SequenceMatcher(None, "ab", "c")

1376

>>> s.find_longest_match(0, 2, 0, 1)

1377

(0, 0, 0)

1378

"""

1379

1380

# CAUTION: stripping common prefix or suffix would be incorrect.

1381

# E.g.,

1382

# ab

1383

# acab

1384

# Longest matching block is "ab", but if common prefix is

1385

# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so

1386

# strip, so ends up claiming that ab is changed to acab by

1387

# inserting "ca" in the middle. That's minimal but unintuitive:

1388

# "it's obvious" that someone inserted "ac" at the front.

1389

# Windiff ends up at the same place as diff, but by pairing up

1390

# the unique 'b's and then matching the first two 'a's.

1391

1392

a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk

1393

besti, bestj, bestsize = alo, blo, 0

1394

# find longest junk-free match

1395

# during an iteration of the loop, j2len[j] = length of longest

1396

# junk-free match ending with a[i-1] and b[j]

1397

j2len = {}

1398

# nothing = []

1399

b2jget = b2j.get

1400

for i in xrange(alo, ahi):

1401

# look at all instances of a[i] in b; note that because

1402

# b2j has no junk keys, the loop is skipped if a[i] is junk

1403

j2lenget = j2len.get

1404

newj2len = {}

1405

1406

# changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the

1407

# following improvement

1408

# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)

1409

# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>

1410

# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>

1411

# to

1412

# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)

1413

# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>

1414

# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>

1415

1416

try:

1417

js = b2j[a[i]]

1418

except KeyError:

1419

pass

1420

else:

1421

for j in js:

1422

# a[i] matches b[j]

1423

if j >= blo:

1424

if j >= bhi:

1425

break

1426

k = newj2len[j] = 1 + j2lenget(-1 + j, 0)

1427

if k > bestsize:

1428

besti, bestj, bestsize = 1 + i-k, 1 + j-k, k

1429

j2len = newj2len

1430

1431

# Extend the best by non-junk elements on each end. In particular,

1432

# "popular" non-junk elements aren't in b2j, which greatly speeds

1433

# the inner loop above, but also means "the best" match so far

1434

# doesn't contain any junk *or* popular non-junk elements.

1435

while besti > alo and bestj > blo and \

1436

not isbjunk(b[bestj-1]) and \

1437

a[besti-1] == b[bestj-1]:

1438

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

1439

while besti+bestsize < ahi and bestj+bestsize < bhi and \

1440

not isbjunk(b[bestj+bestsize]) and \

1441

a[besti+bestsize] == b[bestj+bestsize]:

1442

bestsize += 1

1443

1444

# Now that we have a wholly interesting match (albeit possibly

1445

# empty!), we may as well suck up the matching junk on each

1446

# side of it too. Can't think of a good reason not to, and it

1447

# saves post-processing the (possibly considerable) expense of

1448

# figuring out what to do with it. In the case of an empty

1449

# interesting match, this is clearly the right thing to do,

1450

# because no other kind of match is possible in the regions.

1451

while besti > alo and bestj > blo and \

1452

isbjunk(b[bestj-1]) and \

1453

a[besti-1] == b[bestj-1]:

1454

besti, bestj, bestsize = besti-1, bestj-1, bestsize+1

1455

while besti+bestsize < ahi and bestj+bestsize < bhi and \

1456

isbjunk(b[bestj+bestsize]) and \

1457

a[besti+bestsize] == b[bestj+bestsize]:

1458

bestsize = bestsize + 1

1459

1460

return besti, bestj, bestsize

1461

Older »