bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
| 
2052.3.1
by John Arbash Meinel
 Add tests to cleanup the copyright of all source files  | 
1  | 
# Copyright (C) 2006 Canonical Ltd
 | 
| 
1911.2.3
by John Arbash Meinel
 Moving everything into a new location so that we can cache more than just revision ids  | 
2  | 
#
 | 
3  | 
# This program is free software; you can redistribute it and/or modify
 | 
|
4  | 
# it under the terms of the GNU General Public License as published by
 | 
|
5  | 
# the Free Software Foundation; either version 2 of the License, or
 | 
|
6  | 
# (at your option) any later version.
 | 
|
7  | 
#
 | 
|
8  | 
# This program is distributed in the hope that it will be useful,
 | 
|
9  | 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
|
10  | 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
|
11  | 
# GNU General Public License for more details.
 | 
|
12  | 
#
 | 
|
13  | 
# You should have received a copy of the GNU General Public License
 | 
|
14  | 
# along with this program; if not, write to the Free Software
 | 
|
15  | 
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | 
|
16  | 
||
17  | 
# TODO: Some kind of command-line display of revision properties: 
 | 
|
18  | 
# perhaps show them in log -v and allow them as options to the commit command.
 | 
|
19  | 
||
20  | 
"""Some functions to enable caching the conversion between unicode and utf8."""
 | 
|
21  | 
||
| 
2155.1.1
by John Arbash Meinel
 (Dmitry Vasiliev) pre-lookup encoders to improve performance  | 
22  | 
import codecs  | 
23  | 
||
24  | 
||
25  | 
# Resolve the UTF-8 codec functions once, at import time, so the conversion
# helpers in this module can reuse them without repeating the codec lookup.
# Callers index the result with [0] to get the converted string.
_utf8_encode = codecs.getencoder("utf-8")
_utf8_decode = codecs.getdecoder("utf-8")
|
| 
2255.7.95
by Robert Collins
 Add convenience utf8 decode routine for handling strings that might be None  | 
27  | 
def _utf8_decode_with_None(bytestring, _utf8_decode=_utf8_decode):
    """Decode an optional utf-8 string, mapping None to None.

    This wraps _utf8_decode so that callers holding a string that might be
    None do not need their own check.

    :param bytestring: A utf-8 encoded string, or None.
    :param _utf8_decode: The decoder to call, bound as a default argument.
    :return: (None, 0) when bytestring is None, otherwise the result of
        _utf8_decode(bytestring) unchanged.
    """
    if bytestring is not None:
        return _utf8_decode(bytestring)
    return (None, 0)
|
| 
1911.2.3
by John Arbash Meinel
 Moving everything into a new location so that we can cache more than just revision ids  | 
33  | 
|
34  | 
# Two-way cache between unicode strings and their utf-8 encodings.
# Every encode/decode result is stored here, so repeating a conversion that
# has already been done becomes a dictionary lookup instead of a codec call.
_unicode_to_utf8_map = {}
_utf8_to_unicode_map = {}
|
39  | 
||
40  | 
||
41  | 
def encode(unicode_str,
           _uni_to_utf8=_unicode_to_utf8_map,
           _utf8_to_uni=_utf8_to_unicode_map,
           _utf8_encode=_utf8_encode):
    """Take a unicode revision id and return its utf-8 form, cached.

    On a cache miss the string is encoded and the pair is recorded in both
    directions, so a later decode() of the result is also a cache hit.

    :param unicode_str: The unicode string to encode.
    :return: The utf-8 encoded string.
    """
    # The lookup is written as try/KeyError rather than dict.get() followed
    # by an 'is None' test because it is about 50% faster when the key is
    # present: on jam's machine roughly 900ms against 1250ms. These helpers
    # are mostly used while iterating over a knit entry, where the key is
    # nearly always cached already, so the hit path is the one to favour.
    try:
        return _uni_to_utf8[unicode_str]
    except KeyError:
        encoded = _utf8_encode(unicode_str)[0]
        _uni_to_utf8[unicode_str] = encoded
        _utf8_to_uni[encoded] = unicode_str
        return encoded
|
60  | 
||
61  | 
||
62  | 
def decode(utf8_str,
           _uni_to_utf8=_unicode_to_utf8_map,
           _utf8_to_uni=_utf8_to_unicode_map,
           _utf8_decode=_utf8_decode):
    """Take a utf-8 revision id and return its unicode form, cached.

    On a cache miss the string is decoded and the pair is recorded in both
    directions, so a later encode() of the result is also a cache hit.

    :param utf8_str: The utf-8 encoded string to decode.
    :return: The decoded unicode string.
    """
    try:
        return _utf8_to_uni[utf8_str]
    except KeyError:
        decoded = _utf8_decode(utf8_str)[0]
        _utf8_to_uni[utf8_str] = decoded
        _uni_to_utf8[decoded] = utf8_str
        return decoded
|
74  | 
||
75  | 
||
| 
1911.2.5
by John Arbash Meinel
 Update cache tests, add a function to do something like intern() only for unicode objects  | 
76  | 
def get_cached_unicode(unicode_str):
    """Return the cached copy of a unicode string.

    The idea is the same as intern(): equal strings should end up as one
    shared object. The difference is that this works for unicode strings.
    """
    # The result is either the object passed in or the one already cached.
    # encode() records the pair in both maps, so the decode() that follows
    # should amount to a single hash lookup.
    utf8_form = encode(unicode_str)
    return decode(utf8_form)
|
86  | 
||
87  | 
||
| 
2249.5.2
by John Arbash Meinel
 Add a get_cached_utf8, which will ensure it is really utf8, and cache the strings  | 
88  | 
def get_cached_utf8(utf8_str):
    """Return the cached copy of a utf-8 string.

    This is similar to intern(). For now the string is decoded on the way
    through, which ensures it really is utf-8. In the future this might
    change to simply caching the string.
    """
    unicode_form = decode(utf8_str)
    return encode(unicode_form)
|
96  | 
||
97  | 
||
| 
2249.5.3
by John Arbash Meinel
 Add get_cached_ascii for dealing with how cElementTree handles ascii strings  | 
98  | 
def get_cached_ascii(ascii_str,
                     _uni_to_utf8=_unicode_to_utf8_map,
                     _utf8_to_uni=_utf8_to_unicode_map):
    """Return the cached copy of a string that is the same in utf-8 and unicode.

    No utf-8 encoding work is needed for a pure-ASCII string, but the
    utf8-to-unicode map must still hold a real unicode object as its value,
    so the unicode form is produced once and both maps are filled in.

    :param ascii_str: A byte string containing only ASCII characters.
    :return: The cached byte string equal to ascii_str (the object already
        in the cache if there was one, otherwise ascii_str itself).
    :raises UnicodeDecodeError: If ascii_str contains non-ASCII bytes. In
        that case neither cache is modified.
    """
    # Decode before touching either cache, so an invalid input cannot leave
    # a half-written entry behind. Naming the codec explicitly also avoids
    # depending on the interpreter's default encoding.
    uni_str = ascii_str.decode('ascii')
    ascii_str = _uni_to_utf8.setdefault(uni_str, ascii_str)
    _utf8_to_uni.setdefault(ascii_str, uni_str)
    return ascii_str
|
109  | 
||
110  | 
||
| 
1911.2.3
by John Arbash Meinel
 Moving everything into a new location so that we can cache more than just revision ids  | 
111  | 
def clear_encoding_cache():
    """Empty both the encoding and the decoding cache."""
    # The dicts are cleared in place rather than rebound: the conversion
    # functions captured these same objects as default arguments.
    for cache in (_unicode_to_utf8_map, _utf8_to_unicode_map):
        cache.clear()