1325
1324
# this batch call is a lot faster :).
1326
1325
# (4 seconds to 1 seconds for the sample upgrades I was testing).
1327
1326
self.write(''.join(lines))
1329
class SequenceMatcher(difflib.SequenceMatcher):
1330
"""Knit tuned sequence matcher.
1332
This is based on profiling of difflib which indicated some improvements
1333
for our usage pattern.
1336
def find_longest_match(self, alo, ahi, blo, bhi):
1337
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
1339
If isjunk is not defined:
1341
Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
1342
alo <= i <= i+k <= ahi
1343
blo <= j <= j+k <= bhi
1344
and for all (i',j',k') meeting those conditions,
1347
and if i == i', j <= j'
1349
In other words, of all maximal matching blocks, return one that
1350
starts earliest in a, and of all those maximal matching blocks that
1351
start earliest in a, return the one that starts earliest in b.
1353
>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
1354
>>> s.find_longest_match(0, 5, 0, 9)
1357
If isjunk is defined, first the longest matching block is
1358
determined as above, but with the additional restriction that no
1359
junk element appears in the block. Then that block is extended as
1360
far as possible by matching (only) junk elements on both sides. So
1361
the resulting block never matches on junk except as identical junk
1362
happens to be adjacent to an "interesting" match.
1364
Here's the same example as before, but considering blanks to be
1365
junk. That prevents " abcd" from matching the " abcd" at the tail
1366
end of the second sequence directly. Instead only the "abcd" can
1367
match, and matches the leftmost "abcd" in the second sequence:
1369
>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
1370
>>> s.find_longest_match(0, 5, 0, 9)
1373
If no blocks match, return (alo, blo, 0).
1375
>>> s = SequenceMatcher(None, "ab", "c")
1376
>>> s.find_longest_match(0, 2, 0, 1)
1380
# CAUTION: stripping common prefix or suffix would be incorrect.
1384
# Longest matching block is "ab", but if common prefix is
1385
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so
1386
# strip, so ends up claiming that ab is changed to acab by
1387
# inserting "ca" in the middle. That's minimal but unintuitive:
1388
# "it's obvious" that someone inserted "ac" at the front.
1389
# Windiff ends up at the same place as diff, but by pairing up
1390
# the unique 'b's and then matching the first two 'a's.
1392
a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
1393
besti, bestj, bestsize = alo, blo, 0
1394
# find longest junk-free match
1395
# during an iteration of the loop, j2len[j] = length of longest
1396
# junk-free match ending with a[i-1] and b[j]
1400
for i in xrange(alo, ahi):
1401
# look at all instances of a[i] in b; note that because
1402
# b2j has no junk keys, the loop is skipped if a[i] is junk
1403
j2lenget = j2len.get
1406
# changing b2j.get(a[i], nothing) to a try:Keyerror pair produced the
1407
# following improvement
1408
# 704 0 4650.5320 2620.7410 bzrlib.knit:1336(find_longest_match)
1409
# +326674 0 1655.1210 1655.1210 +<method 'get' of 'dict' objects>
1410
# +76519 0 374.6700 374.6700 +<method 'has_key' of 'dict' objects>
1412
# 704 0 3733.2820 2209.6520 bzrlib.knit:1336(find_longest_match)
1413
# +211400 0 1147.3520 1147.3520 +<method 'get' of 'dict' objects>
1414
# +76519 0 376.2780 376.2780 +<method 'has_key' of 'dict' objects>
1426
k = newj2len[j] = 1 + j2lenget(-1 + j, 0)
1428
besti, bestj, bestsize = 1 + i-k, 1 + j-k, k
1431
# Extend the best by non-junk elements on each end. In particular,
1432
# "popular" non-junk elements aren't in b2j, which greatly speeds
1433
# the inner loop above, but also means "the best" match so far
1434
# doesn't contain any junk *or* popular non-junk elements.
1435
while besti > alo and bestj > blo and \
1436
not isbjunk(b[bestj-1]) and \
1437
a[besti-1] == b[bestj-1]:
1438
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1439
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1440
not isbjunk(b[bestj+bestsize]) and \
1441
a[besti+bestsize] == b[bestj+bestsize]:
1444
# Now that we have a wholly interesting match (albeit possibly
1445
# empty!), we may as well suck up the matching junk on each
1446
# side of it too. Can't think of a good reason not to, and it
1447
# saves post-processing the (possibly considerable) expense of
1448
# figuring out what to do with it. In the case of an empty
1449
# interesting match, this is clearly the right thing to do,
1450
# because no other kind of match is possible in the regions.
1451
while besti > alo and bestj > blo and \
1452
isbjunk(b[bestj-1]) and \
1453
a[besti-1] == b[bestj-1]:
1454
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
1455
while besti+bestsize < ahi and bestj+bestsize < bhi and \
1456
isbjunk(b[bestj+bestsize]) and \
1457
a[besti+bestsize] == b[bestj+bestsize]:
1458
bestsize = bestsize + 1
1460
return besti, bestj, bestsize