/brz/remove-bazaar : contents of bzrlib/patiencediff.py at revision 1732.1.4

: (revision 1732.1.4)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	1	#!/usr/bin/env python
1185.81.24 by Aaron Bentley Reoganize patience-related code	2	# Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
1185.81.24 by Aaron Bentley Reoganize patience-related code	3	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
1185.81.24 by Aaron Bentley Reoganize patience-related code	8	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
1185.81.24 by Aaron Bentley Reoganize patience-related code	13	#
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17
	18
1185.81.24 by Aaron Bentley Reoganize patience-related code	19	from bisect import bisect
1185.81.24 by Aaron Bentley Reoganize patience-related code	20	from copy import copy
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	21	import difflib
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	22	import os
	23	import sys
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	24	import time
	25
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	26
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	27	__all__ = ['SequenceMatcher', 'unified_diff', 'unified_diff_files']
1185.81.9 by John Arbash Meinel Added (failing) tests for cdv.recurse_matches with common sections,	28
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	29
1185.81.24 by Aaron Bentley Reoganize patience-related code	30	def unique_lcs(a, b):
	31	"""Find the longest common subset for unique lines.
	32
	33	:param a: An indexable object (such as string or list of strings)
	34	:param b: Another indexable object (such as string or list of strings)
	35	:return: A list of tuples, one for each line which is matched.
	36	[(line_in_a, line_in_b), ...]
	37
	38	This only matches lines which are unique on both sides.
	39	This helps prevent common lines from over influencing match
	40	results.
	41	The longest common subset uses the Patience Sorting algorithm:
	42	http://en.wikipedia.org/wiki/Patience_sorting
	43	"""
	44	# set index[line in a] = position of line in a unless
	45	# unless a is a duplicate, in which case it's set to None
	46	index = {}
	47	for i in xrange(len(a)):
	48	line = a[i]
	49	if line in index:
	50	index[line] = None
	51	else:
	52	index[line]= i
	53	# make btoa[i] = position of line i in a, unless
	54	# that line doesn't occur exactly once in both,
	55	# in which case it's set to None
	56	btoa = [None] * len(b)
	57	index2 = {}
	58	for pos, line in enumerate(b):
	59	next = index.get(line)
	60	if next is not None:
	61	if line in index2:
	62	# unset the previous mapping, which we now know to
	63	# be invalid because the line isn't unique
	64	btoa[index2[line]] = None
	65	del index[line]
	66	else:
	67	index2[line] = pos
	68	btoa[pos] = next
	69	# this is the Patience sorting algorithm
	70	# see http://en.wikipedia.org/wiki/Patience_sorting
	71	backpointers = [None] * len(b)
	72	stacks = []
	73	lasts = []
	74	k = 0
	75	for bpos, apos in enumerate(btoa):
	76	if apos is None:
	77	continue
	78	# as an optimization, check if the next line comes at the end,
	79	# because it usually does
	80	if stacks and stacks[-1] < apos:
	81	k = len(stacks)
	82	# as an optimization, check if the next line comes right after
	83	# the previous line, because usually it does
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	84	elif stacks and stacks[k] < apos and (k == len(stacks) - 1 or
	85	stacks[k+1] > apos):
1185.81.24 by Aaron Bentley Reoganize patience-related code	86	k += 1
	87	else:
	88	k = bisect(stacks, apos)
	89	if k > 0:
	90	backpointers[bpos] = lasts[k-1]
	91	if k < len(stacks):
	92	stacks[k] = apos
	93	lasts[k] = bpos
	94	else:
	95	stacks.append(apos)
	96	lasts.append(bpos)
	97	if len(lasts) == 0:
	98	return []
	99	result = []
	100	k = lasts[-1]
	101	while k is not None:
	102	result.append((btoa[k], k))
	103	k = backpointers[k]
	104	result.reverse()
	105	return result
	106
	107
	108	def recurse_matches(a, b, ahi, bhi, answer, maxrecursion):
	109	"""Find all of the matching text in the lines of a and b.
	110
	111	:param a: A sequence
	112	:param b: Another sequence
	113	:param ahi: The maximum length of a to check, typically len(a)
	114	:param bhi: The maximum length of b to check, typically len(b)
	115	:param answer: The return array. Will be filled with tuples
	116	indicating [(line_in_a), (line_in_b)]
	117	:param maxrecursion: The maximum depth to recurse.
	118	Must be a positive integer.
	119	:return: None, the return value is in the parameter answer, which
	120	should be a list
	121
	122	"""
	123	oldlen = len(answer)
	124	if maxrecursion < 0:
	125	# this will never happen normally, this check is to prevent DOS attacks
	126	return
	127	oldlength = len(answer)
	128	if len(answer) == 0:
	129	alo, blo = 0, 0
	130	else:
	131	alo, blo = answer[-1]
	132	alo += 1
	133	blo += 1
	134	if alo == ahi or blo == bhi:
	135	return
	136	for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
	137	# recurse between lines which are unique in each file and match
	138	apos += alo
	139	bpos += blo
	140	recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1)
	141	answer.append((apos, bpos))
	142	if len(answer) > oldlength:
	143	# find matches between the last match and the end
	144	recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
	145	elif a[alo] == b[blo]:
	146	# find matching lines at the very beginning
	147	while alo < ahi and blo < bhi and a[alo] == b[blo]:
	148	answer.append((alo, blo))
	149	alo += 1
150	blo += 1
151	recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1)
152	elif a[ahi - 1] == b[bhi - 1]:
153	# find matching lines at the very end
154	nahi = ahi - 1
155	nbhi = bhi - 1
156	while nahi > alo and nbhi > blo and a[nahi - 1] == b[nbhi - 1]:
157	nahi -= 1
158	nbhi -= 1
159	recurse_matches(a, b, nahi, nbhi, answer, maxrecursion - 1)
160	for i in xrange(ahi - nahi):
161	answer.append((nahi + i, nbhi + i))
162
163
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	164	class SequenceMatcher(difflib.SequenceMatcher):
1185.81.5 by John Arbash Meinel Fix up SequenceMatcher, add comments to nofrillsprecisemerge	165	"""Compare a pair of sequences using longest common subset."""
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	166
1185.81.5 by John Arbash Meinel Fix up SequenceMatcher, add comments to nofrillsprecisemerge	167	def __init__(self, isjunk=None, a='', b=''):
	168	if isjunk is not None:
	169	raise NotImplementedError('Currently we do not support'
	170	' isjunk for sequence matching')
	171	difflib.SequenceMatcher.__init__(self, isjunk, a, b)
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	172
1185.81.11 by John Arbash Meinel Found some edge cases that weren't being matched.	173	def _check_with_diff(self, alo, ahi, blo, bhi, answer):
	174	"""Use the original diff algorithm on an unmatched section.
	175
	176	This will check to make sure the range is worth checking,
	177	before doing any work.
	178
	179	:param alo: The last line that actually matched
	180	:param ahi: The next line that actually matches
	181	:param blo: Same as alo, only for the 'b' set
	182	:param bhi: Same as ahi
	183	:param answer: An array which will have the new ranges appended to it
	184	:return: None
	185	"""
	186	# WORKAROUND
	187	# recurse_matches has an implementation design
	188	# which does not match non-unique lines in the
	189	# if they do not touch matching unique lines
	190	# so we rerun the regular diff algorithm
	191	# if find a large enough chunk.
	192
	193	# recurse_matches already looked at the direct
	194	# neighbors, so we only need to run if there is
	195	# enough space to do so
	196	if ahi - alo > 2 and bhi - blo > 2:
1185.81.16 by John Arbash Meinel Added tests, and an assert check to make sure ranges are always increasing.	197	a = self.a[alo+1:ahi-1]
1185.81.11 by John Arbash Meinel Found some edge cases that weren't being matched.	198	b = self.b[blo+1:bhi-1]
	199	m = difflib.SequenceMatcher(None, a, b)
	200	new_blocks = m.get_matching_blocks()
	201	# difflib always adds a final match
	202	new_blocks.pop()
	203	for blk in new_blocks:
	204	answer.append((blk[0]+alo+1,
	205	blk[1]+blo+1,
	206	blk[2]))
	207
1185.81.4 by John Arbash Meinel moved the logic deeper into difflib.	208	def __helper(self, alo, ahi, blo, bhi, answer):
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	209	matches = []
1185.81.4 by John Arbash Meinel moved the logic deeper into difflib.	210	a = self.a[alo:ahi]
	211	b = self.b[blo:bhi]
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	212	recurse_matches(a, b, len(a), len(b), matches, 10)
	213	# Matches now has individual line pairs of
	214	# line A matches line B, at the given offsets
	215
	216	start_a = start_b = None
	217	length = 0
	218	for i_a, i_b in matches:
	219	if (start_a is not None
	220	and (i_a == start_a + length)
	221	and (i_b == start_b + length)):
	222	length += 1
	223	else:
	224	# New block
1185.81.11 by John Arbash Meinel Found some edge cases that weren't being matched.	225	if start_a is None:
	226	# We need to check from 0,0 until the current match
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	227	self._check_with_diff(alo-1, i_a+alo, blo-1, i_b+blo,
	228	answer)
1185.81.11 by John Arbash Meinel Found some edge cases that weren't being matched.	229	else:
1185.81.4 by John Arbash Meinel moved the logic deeper into difflib.	230	answer.append((start_a+alo, start_b+blo, length))
1185.81.11 by John Arbash Meinel Found some edge cases that weren't being matched.	231	self._check_with_diff(start_a+alo+length, i_a+alo,
	232	start_b+blo+length, i_b+blo,
	233	answer)
1185.81.10 by John Arbash Meinel Added some more test cases.	234
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	235	start_a = i_a
	236	start_b = i_b
	237	length = 1
	238
	239	if length != 0:
1185.81.11 by John Arbash Meinel Found some edge cases that weren't being matched.	240	answer.append((start_a+alo, start_b+blo, length))
	241	self._check_with_diff(start_a+alo+length, ahi+1,
	242	start_b+blo+length, bhi+1,
	243	answer)
	244	if not matches:
	245	# Nothing matched, so we need to send the complete text
	246	self._check_with_diff(alo-1, ahi+1, blo-1, bhi+1, answer)
1185.81.1 by John Arbash Meinel Adding nofrillsprecisemerge's diff algorithm, wrapped in difflib.	247
1185.81.16 by John Arbash Meinel Added tests, and an assert check to make sure ranges are always increasing.	248	# For consistency sake, make sure all matches are only increasing
	249	if __debug__:
	250	next_a = -1
	251	next_b = -1
	252	for a,b,match_len in answer:
	253	assert a >= next_a, 'Non increasing matches for a'
	254	assert b >= next_b, 'Not increasing matches for b'
	255	next_a = a + match_len
	256	next_b = b + match_len
	257
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	258
1185.81.8 by John Arbash Meinel Updating unified_diff to take a factory, using the new diff algorithm in the code.	259	# This is a version of unified_diff which only adds a factory parameter
	260	# so that you can override the default SequenceMatcher
	261	# this has been submitted as a patch to python
	262	def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
	263	tofiledate='', n=3, lineterm='\n',
	264	sequencematcher=None):
	265	r"""
	266	Compare two sequences of lines; generate the delta as a unified diff.
	267
	268	Unified diffs are a compact way of showing line changes and a few
	269	lines of context. The number of context lines is set by 'n' which
	270	defaults to three.
	271
	272	By default, the diff control lines (those with ---, +++, or @@) are
	273	created with a trailing newline. This is helpful so that inputs
	274	created from file.readlines() result in diffs that are suitable for
	275	file.writelines() since both the inputs and outputs have trailing
	276	newlines.
	277
	278	For inputs that do not have trailing newlines, set the lineterm
	279	argument to "" so that the output will be uniformly newline free.
	280
	281	The unidiff format normally has a header for filenames and modification
	282	times. Any or all of these may be specified using strings for
	283	'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification
	284	times are normally expressed in the format returned by time.ctime().
	285
	286	Example:
	287
	288	>>> for line in unified_diff('one two three four'.split(),
	289	... 'zero one tree four'.split(), 'Original', 'Current',
	290	... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
	291	... lineterm=''):
	292	... print line
	293	--- Original Sat Jan 26 23:30:50 1991
	294	+++ Current Fri Jun 06 10:20:52 2003
	295	@@ -1,4 +1,4 @@
	296	+zero
	297	one
	298	-two
	299	-three
	300	+tree
	301	four
	302	"""
	303	if sequencematcher is None:
	304	sequencematcher = difflib.SequenceMatcher
	305
	306	started = False
	307	for group in sequencematcher(None,a,b).get_grouped_opcodes(n):
	308	if not started:
	309	yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
	310	yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
	311	started = True
	312	i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
	313	yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
	314	for tag, i1, i2, j1, j2 in group:
	315	if tag == 'equal':
	316	for line in a[i1:i2]:
	317	yield ' ' + line
	318	continue
	319	if tag == 'replace' or tag == 'delete':
	320	for line in a[i1:i2]:
	321	yield '-' + line
	322	if tag == 'replace' or tag == 'insert':
323	for line in b[j1:j2]:
324	yield '+' + line
325
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	326
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	327	def unified_diff_files(a, b, sequencematcher=None):
	328	"""Generate the diff for two files.
	329	"""
	330	# Should this actually be an error?
	331	if a == b:
	332	return []
	333	if a == '-':
	334	file_a = sys.stdin
	335	time_a = time.time()
	336	else:
	337	file_a = open(a, 'rb')
	338	time_a = os.stat(a).st_mtime
	339
	340	if b == '-':
	341	file_b = sys.stdin
	342	time_b = time.time()
	343	else:
	344	file_b = open(b, 'rb')
	345	time_b = os.stat(b).st_mtime
	346
	347	# TODO: Include fromfiledate and tofiledate
	348	return unified_diff(file_a.readlines(), file_b.readlines(),
	349	fromfile=a, tofile=b,
	350	sequencematcher=sequencematcher)
	351
1185.81.29 by Aaron Bentley Fix style issues and duplicated tests	352
1185.81.14 by John Arbash Meinel Added a main function for running cdvdifflib manually, included tests for unified_diff interfaces	353	def main(args):
	354	import optparse
	355	p = optparse.OptionParser(usage='%prog [options] file_a file_b'
	356	'\nFiles can be "-" to read from stdin')
	357	p.add_option('--cdv', dest='matcher', action='store_const', const='cdv',
	358	default='cdv', help='Use the cdv difference algorithm')
	359	p.add_option('--difflib', dest='matcher', action='store_const', const='difflib',
	360	default='cdv', help='Use python\'s difflib algorithm')
	361
	362	algorithms = {'cdv':SequenceMatcher, 'difflib':difflib.SequenceMatcher}
	363
	364	(opts, args) = p.parse_args(args)
	365	matcher = algorithms[opts.matcher]
	366
	367	if len(args) != 2:
	368	print 'You must supply 2 filenames to diff'
	369	return -1
	370
	371	for line in unified_diff_files(args[0], args[1], sequencematcher=matcher):
	372	sys.stdout.write(line)
	373
	374	if __name__ == '__main__':
	375	sys.exit(main(sys.argv[1:]))