/brz/remove-bazaar : contents of breezy/utextwrap.py at revision 7490.7.6

: (revision 7490.7.6)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	1	# Copyright (C) 2011 Canonical Ltd
	2	#
5820.1.18 by INADA Naoki Add copyright for some function.	3	# UTextWrapper._handle_long_word, UTextWrapper._wrap_chunks,
5820.1.27 by INADA Naoki Fix error when fix_sentence_endings=True.	4	# UTextWrapper._fix_sentence_endings, wrap and fill is copied from Python's
	5	# textwrap module (under PSF license) and modified for support CJK.
5820.1.18 by INADA Naoki Add copyright for some function.	6	# Original Copyright for these functions:
	7	#
	8	# Copyright (C) 1999-2001 Gregory P. Ward.
	9	# Copyright (C) 2002, 2003 Python Software Foundation.
	10	#
	11	# Written by Greg Ward <gward@python.net>
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	25
6379.6.3 by Jelmer Vernooij Use absolute_import.	26	from __future__ import absolute_import
6379.6.3 by Jelmer Vernooij Use absolute_import.	27
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	28	import textwrap
	29	from unicodedata import east_asian_width as _eawidth
	30
6624 by Jelmer Vernooĳ Merge Python3 porting work ('py3 pokes')	31	from . import osutils
5820.1.10 by INADA Naoki utextwrap: Change a way to split between CJK characters.	32
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	33	__all__ = ["UTextWrapper", "fill", "wrap"]
	34
7143.15.2 by Jelmer Vernooĳ Run autopep8.	35
class UTextWrapper(textwrap.TextWrapper):
    """
    Extend TextWrapper for Unicode.

    This textwrapper handles east asian double width and split word
    even if !break_long_words when word contains double width
    characters.

    :param ambiguous_width: (keyword argument) width for character when
        unicodedata.east_asian_width(c) == 'A'
        (default: 1)

    Limitations:
    * expand_tabs is not fixed. It uses len() for calculating width
      of string on left of TAB.
    * Handles one codeunit as a single character having 1 or 2 width.
      This is not correct when there are surrogate pairs, combined
      characters or zero-width characters.
    * Treats all asian character are line breakable. But it is not
      true because line breaking is prohibited around some characters.
      (For example, breaking before punctuation mark is prohibited.)
      See UAX # 14 "UNICODE LINE BREAKING ALGORITHM"
    """

    def __init__(self, width=None, **kwargs):
        """Create a wrapper.

        :param width: Maximum display width of a wrapped line. When None,
            one column less than the terminal width reported by osutils
            (or osutils.default_terminal_width when that is unknown).
        :param kwargs: Passed on to textwrap.TextWrapper, except for
            'ambiguous_width' (1 or 2) which is consumed here.
        :raises ValueError: if ambiguous_width is neither 1 nor 2.
        """
        if width is None:
            width = (osutils.terminal_width() or
                     osutils.default_terminal_width) - 1

        # East asian width categories that are counted as two columns.
        ambi_width = kwargs.pop('ambiguous_width', 1)
        if ambi_width == 1:
            self._east_asian_doublewidth = 'FW'
        elif ambi_width == 2:
            self._east_asian_doublewidth = 'FWA'
        else:
            raise ValueError("ambiguous_width should be 1 or 2")

        # Set max_lines ourselves so _wrap_chunks can always read it
        # (presumably for TextWrapper versions that do not define it).
        self.max_lines = kwargs.get('max_lines', None)
        textwrap.TextWrapper.__init__(self, width, **kwargs)

    def _unicode_char_width(self, uc):
        """Return width of character `uc`.

        :param: uc Single unicode character.
        :return: 2 when the east asian width category of `uc` is one of
            the categories configured as double width, otherwise 1.
        """
        # Category 'A' (ambiguous) is only part of the double width set
        # when the wrapper was created with ambiguous_width=2.
        return 2 if _eawidth(uc) in self._east_asian_doublewidth else 1

    def _width(self, s):
        """Returns width for s.

        The width is the sum of the widths of every character in s, so
        east asian double width characters count as two columns.
        """
        charwidth = self._unicode_char_width
        return sum(charwidth(c) for c in s)

    def _cut(self, s, width):
        """Returns head and rest of s. (head+rest == s)

        Head is large as long as _width(head) <= width.
        """
        w = 0
        charwidth = self._unicode_char_width
        for pos, c in enumerate(s):
            w += charwidth(c)
            if w > width:
                return s[:pos], s[pos:]
        return s, u''

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        The list is modified in place; nothing is returned.

        Note: This function is copied from textwrap.TextWrap and modified
        to use unicode always.
        """
        i = 0
        last = len(chunks) - 1
        patsearch = self.sentence_end_re.search
        while i < last:
            if chunks[i + 1] == u" " and patsearch(chunks[i]):
                # Two spaces after the end of a sentence.
                chunks[i + 1] = u"  "
                i += 2
            else:
                i += 1

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """Deal with a chunk that is too wide to fit on any line.

        :param chunks: Reversed list of pending chunks; the long chunk is
            chunks[-1]. Modified in place.
        :param cur_line: Chunks already placed on the current line.
            Modified in place.
        :param cur_len: Display width already used on the current line.
        :param width: Display width available for the current line.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 2:
            space_left = self._width(chunks[-1][0]) if chunks[-1] else 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            head, rest = self._cut(chunks[-1], space_left)
            cur_line.append(head)
            if rest:
                chunks[-1] = rest
            else:
                del chunks[-1]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """Wrap a sequence of chunks into lines and return the lines.

        All widths are measured with _width(), so east asian double
        width characters count as two columns.

        :param chunks: List of text chunks. It is reversed and consumed
            in place.
        :return: List of wrapped lines, each prefixed with its indent.
        :raises ValueError: if self.width is not positive, or if
            max_lines is set and the placeholder does not fit on a line.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)
        if self.max_lines is not None:
            if self.max_lines > 1:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            if (self._width(indent) +
                    self._width(self.placeholder.lstrip()) > self.width):
                raise ValueError("placeholder too large for max width")

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is the display width of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line. The indent is measured in
            # display columns too, so a double width indent cannot push
            # the line past self.width.
            width = self.width - self._width(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                # Use _width instead of len for east asian width
                chunk_width = self._width(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_width <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_width

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on any line (not just this one).
            if chunks and self._width(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)
                # Recompute in display columns, not in characters.
                cur_len = sum(map(self._width, cur_line))

            # If the last chunk on this line is all whitespace, drop it
            # and give its width back so the max_lines checks below see
            # the real width of the line.
            if (self.drop_whitespace and cur_line and
                    not cur_line[-1].strip()):
                cur_len -= self._width(cur_line[-1])
                del cur_line[-1]

            if cur_line:
                if (self.max_lines is None or
                    len(lines) + 1 < self.max_lines or
                    (not chunks or
                     self.drop_whitespace and
                     len(chunks) == 1 and
                     not chunks[0].strip()) and cur_len <= width):
                    # Convert current line back to a string and store it in
                    # list of all lines (return value).
                    lines.append(indent + u''.join(cur_line))
                else:
                    # This is the last allowed line but text remains:
                    # drop chunks from the end until the placeholder fits.
                    while cur_line:
                        if (cur_line[-1].strip() and
                                cur_len + self._width(self.placeholder) <=
                                width):
                            cur_line.append(self.placeholder)
                            lines.append(indent + ''.join(cur_line))
                            break
                        cur_len -= self._width(cur_line[-1])
                        del cur_line[-1]
                    else:
                        # Nothing on this line can stay; try to hang the
                        # placeholder on the previous line instead.
                        if lines:
                            prev_line = lines[-1].rstrip()
                            if (self._width(prev_line) +
                                    self._width(self.placeholder) <=
                                    self.width):
                                lines[-1] = prev_line + self.placeholder
                                break
                        lines.append(indent + self.placeholder.lstrip())
                    break

        return lines

    def _split(self, text):
        """Split text into chunks, isolating every double width character.

        The text is first split by textwrap.TextWrapper._split; each
        resulting chunk is then cut so that every double width character
        becomes a chunk of its own, which allows a line break next to it.
        """
        chunks = textwrap.TextWrapper._split(self, osutils.safe_unicode(text))
        cjk_split_chunks = []
        for chunk in chunks:
            prev_pos = 0
            for pos, char in enumerate(chunk):
                if self._unicode_char_width(char) == 2:
                    if prev_pos < pos:
                        cjk_split_chunks.append(chunk[prev_pos:pos])
                    cjk_split_chunks.append(char)
                    prev_pos = pos + 1
            if prev_pos < len(chunk):
                cjk_split_chunks.append(chunk[prev_pos:])
        return cjk_split_chunks

    def wrap(self, text):
        """Wrap text and return the list of lines."""
        # ensure text is unicode
        return textwrap.TextWrapper.wrap(self, osutils.safe_unicode(text))
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	271
	272	# -- Convenience interface ---------------------------------------------
	273
7143.15.2 by Jelmer Vernooĳ Run autopep8.	274
def wrap(text, width=None, **kwargs):
    """Wrap one paragraph of text and return the wrapped lines as a list.

    A UTextWrapper is created with the given 'width' and keyword
    arguments and its wrap() method is applied to 'text'.  When 'width'
    is None the wrapper chooses a width based on the terminal.  See the
    UTextWrapper and textwrap.TextWrapper classes for the keyword
    arguments that customize wrapping behaviour.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.wrap(text)
5820.1.1 by INADA Naoki Add utextwrap that is same to textwrap but supports double width characters in east asia.	286
7143.15.2 by Jelmer Vernooĳ Run autopep8.	287
def fill(text, width=None, **kwargs):
    """Fill one paragraph of text and return it as a single new string.

    A UTextWrapper is created with the given 'width' and keyword
    arguments and its fill() method is applied to 'text'.  When 'width'
    is None the wrapper chooses a width based on the terminal.  See the
    UTextWrapper and textwrap.TextWrapper classes for the keyword
    arguments that customize wrapping behaviour.
    """
    wrapper = UTextWrapper(width=width, **kwargs)
    return wrapper.fill(text)