/brz/remove-bazaar : contents of bzrlib/cache_utf8.py at revision 2255.2.196

: (revision 2255.2.196)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# Copyright (C) 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# TODO: Some kind of command-line display of revision properties: 
# perhaps show them in log -v and allow them as options to the commit command.

"""Some functions to enable caching the conversion between unicode to utf8"""

import codecs


# Look the UTF-8 codec functions up once, at import time, so the functions
# below do not repeat the codec lookup on every call. Each callable returns a
# tuple whose first item is the converted string, which is why the call sites
# index the result with [0].
_utf8_encode = codecs.getencoder("utf-8")
_utf8_decode = codecs.getdecoder("utf-8")
# Variant of _utf8_decode for optional strings: None is passed straight
# through instead of being handed to the codec.
def _utf8_decode_with_None(bytestring, _utf8_decode=_utf8_decode):
    """Decode a utf-8 byte string, tolerating None.

    :param bytestring: The utf-8 encoded string to decode, or None.
    :param _utf8_decode: Private. The decoder, bound as a default argument.
    :return: ``(None, 0)`` when bytestring is None, otherwise whatever
        ``_utf8_decode(bytestring)`` returns.
    """
    if bytestring is not None:
        return _utf8_decode(bytestring)
    return (None, 0)

# Map strings (originally revision ids) from and to their utf8 encoding.
# Whenever we do an encode/decode operation, we save the result in both
# directions, so that we don't have to do the conversion again.
# Nothing in this module evicts entries; they stay until
# clear_encoding_cache() is called.
_unicode_to_utf8_map = {}
_utf8_to_unicode_map = {}


def encode(unicode_str,
           _uni_to_utf8=_unicode_to_utf8_map,
           _utf8_to_uni=_utf8_to_unicode_map,
           _utf8_encode=_utf8_encode):
    """Return the utf8 encoding of a unicode string, caching the result.

    On a cache miss the string is encoded and both directions of the mapping
    are recorded, so a later decode() of the returned value is a plain
    dictionary lookup.

    :param unicode_str: The unicode string (e.g. a revision id) to encode.
    :param _uni_to_utf8: Private. The unicode -> utf8 cache.
    :param _utf8_to_uni: Private. The utf8 -> unicode cache.
    :param _utf8_encode: Private. The pre-looked-up utf-8 encoder.
    :return: The utf8 encoded string.
    """
    # Deliberately try/except KeyError rather than dict.get() followed by a
    # None test: when the key is already cached the try form is about 50%
    # faster (on jam's machine: 900ms for try/KeyError versus 1250ms for the
    # "if None" form). These functions are mostly used while iterating over a
    # knit entry, where *most* lookups hit the cache, so the hit path is the
    # one worth optimising.
    try:
        return _uni_to_utf8[unicode_str]
    except KeyError:
        pass
    # Cache miss: do the real conversion and remember it both ways.
    encoded = _utf8_encode(unicode_str)[0]
    _uni_to_utf8[unicode_str] = encoded
    _utf8_to_uni[encoded] = unicode_str
    return encoded


def decode(utf8_str,
           _uni_to_utf8=_unicode_to_utf8_map,
           _utf8_to_uni=_utf8_to_unicode_map,
           _utf8_decode=_utf8_decode):
    """Return the unicode form of a utf8 string, caching the result.

    On a cache miss the string is decoded and both directions of the mapping
    are recorded, so a later encode() of the returned value is a plain
    dictionary lookup.

    :param utf8_str: The utf8 encoded string (e.g. a revision id) to decode.
    :param _uni_to_utf8: Private. The unicode -> utf8 cache.
    :param _utf8_to_uni: Private. The utf8 -> unicode cache.
    :param _utf8_decode: Private. The pre-looked-up utf-8 decoder.
    :return: The decoded unicode string.
    """
    # Same try/except KeyError fast path as encode(): hits are the common case.
    try:
        return _utf8_to_uni[utf8_str]
    except KeyError:
        pass
    # Cache miss: do the real conversion and remember it both ways.
    decoded = _utf8_decode(utf8_str)[0]
    _utf8_to_uni[utf8_str] = decoded
    _uni_to_utf8[decoded] = utf8_str
    return decoded


def get_cached_unicode(unicode_str):
    """Return the cached copy of a unicode string.

    The idea is similar to intern(): equal strings should come back as one
    shared object. Unlike intern(), this works for unicode strings.

    :param unicode_str: The unicode string to look up.
    :return: Either unicode_str itself or the equal string already held in
        the cache.
    """
    # encode() records the string in both maps if it was not there yet, so
    # the decode() that follows should be nothing more than a hash lookup.
    utf8_form = encode(unicode_str)
    return decode(utf8_form)


def get_cached_utf8(utf8_str):
    """Return the cached copy of a utf-8 string (similar to intern()).

    At present the string is run through decode(), which ensures it really
    is utf-8. In the future this might change to simply caching the string.

    :param utf8_str: The utf-8 encoded string to look up.
    :return: The utf-8 string held in the cache for this value.
    """
    unicode_form = decode(utf8_str)
    return encode(unicode_form)


def get_cached_ascii(ascii_str,
                     _uni_to_utf8=_unicode_to_utf8_map,
                     _utf8_to_uni=_utf8_to_unicode_map):
    """Return the cached copy of a string that is identical in utf-8 and unicode.

    No encoding work is needed for such a string, so it is registered in the
    caches directly.

    :param ascii_str: A plain ASCII string.
    :param _uni_to_utf8: Private. The unicode -> utf8 cache.
    :param _utf8_to_uni: Private. The utf8 -> unicode cache.
    :return: The string held in the cache for this value.
    :raises: Whatever unicode(ascii_str) raises when the input cannot be
        converted (for a byte string that is not pure ASCII this is expected
        to be UnicodeDecodeError under Python 2's default encoding). Neither
        cache is modified in that case.
    """
    # Do the only step that can fail *before* touching either cache.
    # Previously the string was inserted into _uni_to_utf8 first, so a failed
    # conversion left behind an identity entry with no reverse mapping, which
    # a later encode() of an equal key would then return.
    uni_str = unicode(ascii_str)
    # We don't need to do any encoding, but we want _utf8_to_uni to return a
    # real Unicode string. Unicode and plain strings of this type will have the
    # same hash, so we can just use it as the key in _uni_to_utf8, but we need
    # the return value to be different in _utf8_to_uni
    ascii_str = _uni_to_utf8.setdefault(ascii_str, ascii_str)
    _utf8_to_uni.setdefault(ascii_str, uni_str)
    return ascii_str


def clear_encoding_cache():
    """Empty both the encoding and the decoding cache.

    The dictionaries are cleared in place rather than replaced, so the
    functions that bound them as default arguments see the emptied caches.
    """
    for cache in (_unicode_to_utf8_map, _utf8_to_unicode_map):
        cache.clear()

2052.3.1 by John Arbash Meinel Add tests to cleanup the copyright of all source files	1	# Copyright (C) 2006 Canonical Ltd
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	2	#
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
	17	# TODO: Some kind of command-line display of revision properties:
	18	# perhaps show them in log -v and allow them as options to the commit command.
	19
	20	"""Some functions to enable caching the conversion between unicode to utf8"""
	21
2155.1.1 by John Arbash Meinel (Dmitry Vasiliev) pre-lookup encoders to improve performance	22	import codecs
	23
	24
	25	_utf8_encode = codecs.getencoder("utf-8")
	26	_utf8_decode = codecs.getdecoder("utf-8")
2255.7.95 by Robert Collins Add convenience utf8 decode routine for handling strings that might be None	27	# wrap _utf8_decode to support None->None for optional strings.
	28	def _utf8_decode_with_None(bytestring, _utf8_decode=_utf8_decode):
	29	if bytestring is None:
	30	return (None, 0)
	31	else:
	32	return _utf8_decode(bytestring)
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	33
	34	# Map revisions from and to utf8 encoding
	35	# Whenever we do an encode/decode operation, we save the result, so that
	36	# we don't have to do it again.
	37	_unicode_to_utf8_map = {}
	38	_utf8_to_unicode_map = {}
	39
	40
	41	def encode(unicode_str,
	42	_uni_to_utf8=_unicode_to_utf8_map,
2155.1.1 by John Arbash Meinel (Dmitry Vasiliev) pre-lookup encoders to improve performance	43	_utf8_to_uni=_utf8_to_unicode_map,
	44	_utf8_encode=_utf8_encode):
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	45	"""Take this unicode revision id, and get a unicode version"""
1934.1.11 by John Arbash Meinel Document why we use try/except rather than if None	46	# If the key is in the cache try/KeyError is 50% faster than
	47	# val = dict.get(key), if val is None:
	48	# On jam's machine the difference is
	49	# try/KeyError: 900ms
	50	# if None: 1250ms
	51	# Since these are primarily used when iterating over a knit entry
	52	# most of the time the key will already be in the cache, so use the
	53	# fast path
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	54	try:
	55	return _uni_to_utf8[unicode_str]
	56	except KeyError:
2155.1.1 by John Arbash Meinel (Dmitry Vasiliev) pre-lookup encoders to improve performance	57	_uni_to_utf8[unicode_str] = utf8_str = _utf8_encode(unicode_str)[0]
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	58	_utf8_to_uni[utf8_str] = unicode_str
	59	return utf8_str
	60
	61
	62	def decode(utf8_str,
	63	_uni_to_utf8=_unicode_to_utf8_map,
2155.1.1 by John Arbash Meinel (Dmitry Vasiliev) pre-lookup encoders to improve performance	64	_utf8_to_uni=_utf8_to_unicode_map,
	65	_utf8_decode=_utf8_decode):
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	66	"""Take a utf8 revision id, and decode it, but cache the result"""
	67	try:
	68	return _utf8_to_uni[utf8_str]
	69	except KeyError:
2249.5.12 by John Arbash Meinel Change the APIs for VersionedFile, Store, and some of Repository into utf-8	70	unicode_str = _utf8_decode(utf8_str)[0]
	71	_utf8_to_uni[utf8_str] = unicode_str
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	72	_uni_to_utf8[unicode_str] = utf8_str
	73	return unicode_str
	74
	75
1911.2.5 by John Arbash Meinel Update cache tests, add a function to do something like intern() only for unicode objects	76	def get_cached_unicode(unicode_str):
	77	"""Return a cached version of the unicode string.
	78
	79	This has a similar idea to that of intern() in that it tries
	80	to return a singleton string. Only it works for unicode strings.
	81	"""
	82	# This might return the same object, or it might return the cached one
	83	# the decode() should just be a hash lookup, because the encode() side
	84	# should add the entry to the maps
	85	return decode(encode(unicode_str))
	86
	87
2249.5.2 by John Arbash Meinel Add a get_cached_utf8, which will ensure it is really utf8, and cache the strings	88	def get_cached_utf8(utf8_str):
	89	"""Return a cached version of the utf-8 string.
	90
	91	Get a cached version of this string (similar to intern()).
	92	At present, this will be decoded to ensure it is a utf-8 string. In the
	93	future this might change to simply caching the string.
	94	"""
	95	return encode(decode(utf8_str))
	96
	97
2249.5.3 by John Arbash Meinel Add get_cached_ascii for dealing with how cElementTree handles ascii strings	98	def get_cached_ascii(ascii_str,
	99	_uni_to_utf8=_unicode_to_utf8_map,
	100	_utf8_to_uni=_utf8_to_unicode_map):
	101	"""This is a string which is identical in utf-8 and unicode."""
	102	# We don't need to do any encoding, but we want _utf8_to_uni to return a
	103	# real Unicode string. Unicode and plain strings of this type will have the
	104	# same hash, so we can just use it as the key in _uni_to_utf8, but we need
	105	# the return value to be different in _utf8_to_uni
	106	ascii_str = _uni_to_utf8.setdefault(ascii_str, ascii_str)
	107	_utf8_to_uni.setdefault(ascii_str, unicode(ascii_str))
	108	return ascii_str
	109
	110
1911.2.3 by John Arbash Meinel Moving everything into a new location so that we can cache more than just revision ids	111	def clear_encoding_cache():
	112	"""Clear the encoding and decoding caches"""
	113	_unicode_to_utf8_map.clear()
	114	_utf8_to_unicode_map.clear()