40
43
def _ensure_utf8_re():
41
"""Make sure the _utf8_re regex has been compiled"""
43
if _utf8_re is not None:
45
_utf8_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
48
def _utf8_escape_replace(match, _map=_utf8_escape_map):
44
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
45
global _utf8_re, _unicode_re
47
_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
48
if _unicode_re is None:
49
_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
52
def _unicode_escape_replace(match, _map=_xml_escape_map):
49
53
"""Replace a string of non-ascii, non XML safe characters with their escape
51
55
This will escape both Standard XML escapes, like <>"', etc.
64
68
return "&#%d;" % ord(match.group())
67
_unicode_to_escaped_map = {}
69
def _encode_and_escape(unicode_str, _map=_unicode_to_escaped_map):
71
def _utf8_escape_replace(match, _map=_xml_escape_map):
72
"""Escape utf8 characters into XML safe ones.
74
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
75
or it is handling characters with the high-bit set. For ascii characters,
76
we just lookup the replacement in the dictionary. For everything else, we
77
decode back into Unicode, and then use the XML escape code.
80
return _map[match.group()]
82
return ''.join('&#%d;' % ord(uni_chr)
83
for uni_chr in match.group().decode('utf8'))
88
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
70
89
"""Encode the string into utf8, and escape invalid XML characters"""
71
90
# We frequently get entities we have not seen before, so it is better
72
91
# to check if None, rather than try/KeyError
73
text = _map.get(unicode_str)
92
text = _map.get(unicode_or_utf8_str)
75
# The alternative policy is to do a regular UTF8 encoding
76
# and then escape only XML meta characters.
77
# Performance is equivalent once you use cache_utf8. *However*
78
# this makes the serialized texts incompatible with old versions
79
# of bzr. So no net gain. (Perhaps the read code would handle utf8
80
# better than entity escapes, but cElementTree seems to do just fine
82
text = str(_utf8_re.sub(_utf8_escape_replace, unicode_str)) + '"'
83
_map[unicode_str] = text
94
if unicode_or_utf8_str.__class__ == unicode:
95
# The alternative policy is to do a regular UTF8 encoding
96
# and then escape only XML meta characters.
97
# Performance is equivalent once you use cache_utf8. *However*
98
# this makes the serialized texts incompatible with old versions
99
# of bzr. So no net gain. (Perhaps the read code would handle utf8
100
# better than entity escapes, but cElementTree seems to do just fine
102
text = str(_unicode_re.sub(_unicode_escape_replace,
103
unicode_or_utf8_str)) + '"'
105
# Plain strings are considered to already be in utf-8 so we do a
106
# slightly different method for escaping.
107
text = _utf8_re.sub(_utf8_escape_replace,
108
unicode_or_utf8_str) + '"'
109
_map[unicode_or_utf8_str] = text
113
def _get_utf8_or_ascii(a_str,
114
_encode_utf8=cache_utf8.encode,
115
_get_cached_ascii=cache_utf8.get_cached_ascii):
116
"""Return a cached version of the string.
118
cElementTree will return a plain string if the XML is plain ascii. It only
119
returns Unicode when it needs to. We want to work in utf-8 strings. So if
120
cElementTree returns a plain string, we can just return the cached version.
121
If it is Unicode, then we need to encode it.
123
:param a_str: An 8-bit string or Unicode as returned by
124
cElementTree.Element.get()
125
:return: A utf-8 encoded 8-bit string.
127
# This is fairly optimized because we know what cElementTree does, this is
128
# not meant as a generic function for all cases. Because it is possible for
129
# an 8-bit string to not be ascii or valid utf8.
130
if a_str.__class__ == unicode:
131
return _encode_utf8(a_str)
133
return _get_cached_ascii(a_str)
87
136
def _clear_cache():
88
137
"""Clean out the unicode => escaped map"""
89
_unicode_to_escaped_map.clear()
138
_to_escaped_map.clear()
92
141
class Serializer_v5(Serializer):
179
233
def _pack_revision(self, rev):
180
234
"""Revision object -> xml tree"""
235
# For the XML format, we need to write them as Unicode rather than as
236
# utf-8 strings. So that cElementTree can handle properly escaping
238
decode_utf8 = cache_utf8.decode
239
revision_id = rev.revision_id
240
if isinstance(revision_id, str):
241
revision_id = decode_utf8(revision_id)
181
242
root = Element('revision',
182
243
committer = rev.committer,
183
timestamp = '%.9f' % rev.timestamp,
184
revision_id = rev.revision_id,
244
timestamp = '%.3f' % rev.timestamp,
245
revision_id = revision_id,
185
246
inventory_sha1 = rev.inventory_sha1,
196
257
pelts.tail = pelts.text = '\n'
197
258
for parent_id in rev.parent_ids:
198
259
assert isinstance(parent_id, basestring)
260
_mod_revision.check_not_reserved_id(parent_id)
199
261
p = SubElement(pelts, 'revision_ref')
263
if isinstance(parent_id, str):
264
parent_id = decode_utf8(parent_id)
201
265
p.set('revision_id', parent_id)
202
266
if rev.properties:
203
267
self._pack_revision_properties(rev, root)
227
293
revision_id = elt.get('revision_id')
228
294
if revision_id is not None:
229
revision_id = cache_utf8.get_cached_unicode(revision_id)
295
revision_id = cache_utf8.encode(revision_id)
230
296
inv = Inventory(root_id, revision_id=revision_id)
232
298
ie = self._unpack_entry(e)
233
if ie.parent_id == ROOT_ID:
299
if ie.parent_id is None:
234
300
ie.parent_id = root_id
238
def _unpack_entry(self, elt, none_parents=False):
304
def _unpack_entry(self, elt):
240
306
if not InventoryEntry.versionable_kind(kind):
241
307
raise AssertionError('unsupported entry kind %s' % kind)
243
get_cached = cache_utf8.get_cached_unicode
309
get_cached = _get_utf8_or_ascii
245
311
parent_id = elt.get('parent_id')
246
if parent_id is None and not none_parents:
248
# TODO: jam 20060817 At present, caching file ids costs us too
249
# much time. It slows down overall read performances from
250
# approx 500ms to 700ms. And doesn't improve future reads.
251
# it might be because revision ids and file ids are mixing.
252
# Consider caching *just* the file ids, for a limited period
254
#parent_id = get_cached(parent_id)
255
#file_id = get_cached(elt.get('file_id'))
256
file_id = elt.get('file_id')
312
if parent_id is not None:
313
parent_id = get_cached(parent_id)
314
file_id = get_cached(elt.get('file_id'))
258
316
if kind == 'directory':
259
317
ie = inventory.InventoryDirectory(file_id,