130
134
return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
131
135
lambda match: match.group(0).encode('unicode_escape'),
139
def get_utf8_or_ascii(a_str,
140
_encode_utf8=cache_utf8.encode,
141
_get_cached_ascii=cache_utf8.get_cached_ascii):
142
"""Return a cached version of the string.
144
cElementTree will return a plain string if the XML is plain ascii. It only
145
returns Unicode when it needs to. We want to work in utf-8 strings. So if
146
cElementTree returns a plain string, we can just return the cached version.
147
If it is Unicode, then we need to encode it.
149
:param a_str: An 8-bit string or Unicode as returned by
150
cElementTree.Element.get()
151
:return: A utf-8 encoded 8-bit string.
153
# This is fairly optimized because we know what cElementTree does; this is
154
# not meant as a generic function for all cases, because it is possible for
155
# an 8-bit string to not be ascii or valid utf8.
156
if a_str.__class__ is unicode:
157
return _encode_utf8(a_str)
162
# Pattern used when escaping plain (8-bit) strings, which this module treats as
# already being utf-8. It matches either a single XML-special character
# (& < > ' ") or a run of one or more characters in the range \x80-\xff, so a
# whole multi-byte sequence is handed to the replacement callback in one match.
_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')
163
# Pattern used when escaping unicode strings. It matches exactly one character
# per match: an XML-special character (& < > ' ") or any character in the
# range U+0080-U+FFFF.
_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
168
"'":"'", # FIXME: overkill
175
def _unicode_escape_replace(match, _map=_xml_escape_map):
176
"""Replace a string of non-ascii, non XML safe characters with their escape
178
This will escape both Standard XML escapes, like <>"', etc.
179
As well as escaping non ascii characters, because ElementTree did.
180
This helps us remain compatible with older versions of bzr. We may change
181
our policy in the future, though.
183
# jam 20060816 Benchmarks show that try/KeyError is faster if you
184
# expect the entity to rarely miss. There is about a 10% difference
185
# in overall time. But if you miss frequently, then if None is much
186
# faster. For our use case, we *rarely* have a revision id, file id
187
# or path name that is unicode. So use try/KeyError.
189
return _map[match.group()]
191
return "&#%d;" % ord(match.group())
194
def _utf8_escape_replace(match, _map=_xml_escape_map):
195
"""Escape utf8 characters into XML safe ones.
197
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
198
or it is handling characters with the high-bit set. For ascii characters,
199
we just lookup the replacement in the dictionary. For everything else, we
200
decode back into Unicode, and then use the XML escape code.
203
return _map[match.group()]
205
return ''.join('&#%d;' % ord(uni_chr)
206
for uni_chr in match.group().decode('utf8'))
211
def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
212
"""Encode the string into utf8, and escape invalid XML characters"""
213
# We frequently get entities we have not seen before, so it is better
214
# to check if None, rather than try/KeyError
215
text = _map.get(unicode_or_utf8_str)
217
if unicode_or_utf8_str.__class__ is unicode:
218
# The alternative policy is to do a regular UTF8 encoding
219
# and then escape only XML meta characters.
220
# Performance is equivalent once you use cache_utf8. *However*
221
# this makes the serialized texts incompatible with old versions
222
# of bzr. So no net gain. (Perhaps the read code would handle utf8
223
# better than entity escapes, but cElementTree seems to do just fine
225
text = str(_unicode_re.sub(_unicode_escape_replace,
226
unicode_or_utf8_str)) + '"'
228
# Plain strings are considered to already be in utf-8 so we do a
229
# slightly different method for escaping.
230
text = _utf8_re.sub(_utf8_escape_replace,
231
unicode_or_utf8_str) + '"'
232
_map[unicode_or_utf8_str] = text
237
"""Clean out the unicode => escaped map"""
238
_to_escaped_map.clear()