22
22
# importing this module is fairly slow because it has to load several
25
from bzrlib.serializer import Serializer
26
from bzrlib.trace import mutter, warning
28
import xml.etree.cElementTree as elementtree
29
from xml.etree.ElementTree import ParseError
30
# it's in this package in python2.5
31
from xml.etree.cElementTree import (ElementTree, SubElement, Element,
32
XMLTreeBuilder, fromstring, tostring)
33
import xml.etree as elementtree
35
from cElementTree import (ElementTree, SubElement, Element,
36
XMLTreeBuilder, fromstring, tostring)
37
import elementtree.ElementTree
38
ParseError = SyntaxError
30
39
except ImportError:
31
# Fall back to pure python implementation if C extension is unavailable
32
import xml.etree.ElementTree as elementtree
34
from xml.etree.ElementTree import ParseError
36
from xml.parsers.expat import ExpatError as ParseError
38
(ElementTree, SubElement, Element, fromstringlist, tostringlist, tostring,
40
elementtree.ElementTree, elementtree.SubElement, elementtree.Element,
41
elementtree.fromstringlist, elementtree.tostringlist, elementtree.tostring,
42
elementtree.fromstring)
56
class XMLSerializer(serializer.Serializer):
40
mutter('WARNING: using slower ElementTree; consider installing cElementTree'
41
" and make sure it's on your PYTHONPATH")
42
# this copy is shipped with bzr
43
from util.elementtree.ElementTree import (ElementTree, SubElement,
44
Element, XMLTreeBuilder,
46
import util.elementtree as elementtree
47
from xml.parsers.expat import ExpatError as ParseError
49
from bzrlib import errors
52
class XMLSerializer(Serializer):
57
53
"""Abstract XML object serialize/deserialize"""
59
55
squashes_xml_invalid_characters = True
61
def read_inventory_from_lines(self, lines, revision_id=None,
62
entry_cache=None, return_from_cache=False):
57
def read_inventory_from_string(self, xml_string, revision_id=None,
63
59
"""Read xml_string into an inventory object.
65
:param chunks: The xml to read.
61
:param xml_string: The xml to read.
66
62
:param revision_id: If not-None, the expected revision id of the
67
63
inventory. Some serialisers use this to set the results' root
68
64
revision. This should be supplied for deserialising all
104
95
def read_revision_from_string(self, xml_string):
    """Parse xml_string and unpack its root element.

    :param xml_string: The xml to parse with fromstring().
    :return: Whatever self._unpack_revision returns for the root element.
    """
    root_element = fromstring(xml_string)
    return self._unpack_revision(root_element)
98
def _write_element(self, elt, f):
99
ElementTree(elt).write(f, 'utf-8')
107
102
def _read_element(self, f):
108
103
return ElementTree().parse(f)
106
# performance tuning for elementtree's serialiser. This should be
107
# sent upstream - RBC 20060523.
108
# the functions here are patched into elementtree at runtime.
110
escape_re = re.compile("[&'\"<>]")
113
"'":"'", # FIXME: overkill
118
def _escape_replace(match, map=escape_map):
    """Return the entry that map holds for the text matched by match.

    Used as the replacement callback for escape_re.sub().
    """
    matched_text = match.group()
    return map[matched_text]
121
def _escape_attrib(text, encoding=None, replace=None):
122
# escape attribute value
126
text = elementtree.ElementTree._encode(text, encoding)
128
return elementtree.ElementTree._encode_entity(text)
130
return escape_re.sub(_escape_replace, text)
132
text = replace(text, "&", "&")
133
text = replace(text, "'", "'") # FIXME: overkill
134
text = replace(text, "\"", """)
135
text = replace(text, "<", "<")
136
text = replace(text, ">", ">")
138
except (TypeError, AttributeError):
139
elementtree.ElementTree._raise_serialization_error(text)
141
elementtree.ElementTree._escape_attrib = _escape_attrib
143
escape_cdata_re = re.compile("[&<>]")
149
def _escape_cdata_replace(match, map=escape_cdata_map):
    """Return the entry that map holds for the text matched by match.

    Used as the replacement callback for escape_cdata_re.sub().
    """
    matched_text = match.group()
    return map[matched_text]
152
def _escape_cdata(text, encoding=None, replace=None):
153
# escape character data
157
text = elementtree.ElementTree._encode(text, encoding)
159
return elementtree.ElementTree._encode_entity(text)
161
return escape_cdata_re.sub(_escape_cdata_replace, text)
163
text = replace(text, "&", "&")
164
text = replace(text, "<", "<")
165
text = replace(text, ">", ">")
167
except (TypeError, AttributeError):
168
elementtree.ElementTree._raise_serialization_error(text)
170
elementtree.ElementTree._escape_cdata = _escape_cdata
111
173
def escape_invalid_chars(message):
112
174
"""Escape the XML-invalid characters in a commit message.
121
183
# aren't listed in the XML specification
122
184
# (http://www.w3.org/TR/REC-xml/#NT-Char).
123
185
return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
124
lambda match: match.group(0).encode(
125
'unicode_escape').decode('ascii'),
129
def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
130
"""Return a cached version of the string.
132
cElementTree will return a plain string if the XML is plain ascii. It only
133
returns Unicode when it needs to. We want to work in utf-8 strings. So if
134
cElementTree returns a plain string, we can just return the cached version.
135
If it is Unicode, then we need to encode it.
137
:param a_str: An 8-bit string or Unicode as returned by
138
cElementTree.Element.get()
139
:return: A utf-8 encoded 8-bit string.
141
# This is fairly optimized because we know what cElementTree does, this is
142
# not meant as a generic function for all cases. Because it is possible for
143
# an 8-bit string to not be ascii or valid utf8.
144
if a_str.__class__ is str:
145
return _encode_utf8(a_str)
150
_utf8_re = lazy_regex.lazy_compile(b'[&<>\'\"]|[\x80-\xff]+')
151
_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
156
"'": "'", # FIXME: overkill
163
def _unicode_escape_replace(match, _map=_xml_escape_map):
164
"""Replace a string of non-ascii, non XML safe characters with their escape
166
This will escape both Standard XML escapes, like <>"', etc.
167
As well as escaping non ascii characters, because ElementTree did.
168
This helps us remain compatible to older versions of bzr. We may change
169
our policy in the future, though.
171
# jam 20060816 Benchmarks show that try/KeyError is faster if you
172
# expect the entity to rarely miss. There is about a 10% difference
173
# in overall time. But if you miss frequently, then if None is much
174
# faster. For our use case, we *rarely* have a revision id, file id
175
# or path name that is unicode. So use try/KeyError.
177
return _map[match.group()]
179
return "&#%d;" % ord(match.group())
182
def _utf8_escape_replace(match, _map=_xml_escape_map):
183
"""Escape utf8 characters into XML safe ones.
185
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
186
or it is handling characters with the high-bit set. For ascii characters,
187
we just lookup the replacement in the dictionary. For everything else, we
188
decode back into Unicode, and then use the XML escape code.
191
return _map[match.group().decode('ascii', 'replace')].encode()
193
return b''.join(b'&#%d;' % ord(uni_chr)
194
for uni_chr in match.group().decode('utf8'))
200
def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
201
"""Encode the string into utf8, and escape invalid XML characters"""
202
# We frequently get entities we have not seen before, so it is better
203
# to check if None, rather than try/KeyError
204
text = _map.get(unicode_or_utf8_str)
206
if isinstance(unicode_or_utf8_str, str):
207
# The alternative policy is to do a regular UTF8 encoding
208
# and then escape only XML meta characters.
209
# Performance is equivalent once you use cache_utf8. *However*
210
# this makes the serialized texts incompatible with old versions
211
# of bzr. So no net gain. (Perhaps the read code would handle utf8
212
# better than entity escapes, but cElementTree seems to do just
214
text = _unicode_re.sub(
215
_unicode_escape_replace, unicode_or_utf8_str).encode()
217
# Plain strings are considered to already be in utf-8 so we do a
218
# slightly different method for escaping.
219
text = _utf8_re.sub(_utf8_escape_replace,
221
_map[unicode_or_utf8_str] = text
226
"""Clean out the unicode => escaped map"""
227
_to_escaped_map.clear()
230
def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
232
file_id = elt_get('file_id')
233
revision = elt_get('revision')
234
# Check and see if we have already unpacked this exact entry
235
# Some timings for "repo.revision_trees(last_100_revs)"
237
# unmodified 4.1s 40.8s
239
# using fifo 2.83s 29.1s
243
# no_copy 2.00s 20.5s
244
# no_c,dict 1.95s 18.0s
245
# Note that a cache of 10k nodes is more than sufficient to hold all of
246
# the inventory for the last 100 revs for bzr, but not for mysql (20k
247
# is enough for mysql, which saves the same 2s as using a dict)
249
# Breakdown of mysql using time.clock()
250
# 4.1s 2 calls to element.get for file_id, revision_id
251
# 4.5s cache_hit lookup
252
# 7.1s InventoryFile.copy()
253
# 2.4s InventoryDirectory.copy()
254
# 0.4s decoding unique entries
255
# 1.6s decoding entries after FIFO fills up
256
# 0.8s Adding nodes to FIFO (including flushes)
257
# 0.1s cache miss lookups
259
# 4.1s 2 calls to element.get for file_id, revision_id
260
# 9.9s cache_hit lookup
261
# 10.8s InventoryEntry.copy()
262
# 0.3s cache miss lookups
263
# 1.2s decoding entries
264
# 1.0s adding nodes to LRU
265
if entry_cache is not None and revision is not None:
266
key = (file_id, revision)
268
# We copy it, because some operations may mutate it
269
cached_ie = entry_cache[key]
273
# Only copying directory entries drops us 2.85s => 2.35s
274
if return_from_cache:
275
if cached_ie.kind == 'directory':
276
return cached_ie.copy()
278
return cached_ie.copy()
281
if not inventory.InventoryEntry.versionable_kind(kind):
282
raise AssertionError('unsupported entry kind %s' % kind)
284
file_id = get_utf8_or_ascii(file_id)
285
if revision is not None:
286
revision = get_utf8_or_ascii(revision)
287
parent_id = elt_get('parent_id')
288
if parent_id is not None:
289
parent_id = get_utf8_or_ascii(parent_id)
291
if kind == 'directory':
292
ie = inventory.InventoryDirectory(file_id,
296
ie = inventory.InventoryFile(file_id,
299
ie.text_sha1 = elt_get('text_sha1')
300
if ie.text_sha1 is not None:
301
ie.text_sha1 = ie.text_sha1.encode('ascii')
302
if elt_get('executable') == 'yes':
304
v = elt_get('text_size')
305
ie.text_size = v and int(v)
306
elif kind == 'symlink':
307
ie = inventory.InventoryLink(file_id,
310
ie.symlink_target = elt_get('symlink_target')
311
elif kind == 'tree-reference':
312
file_id = get_utf8_or_ascii(elt.attrib['file_id'])
313
name = elt.attrib['name']
314
parent_id = get_utf8_or_ascii(elt.attrib['parent_id'])
315
revision = get_utf8_or_ascii(elt.get('revision'))
316
reference_revision = get_utf8_or_ascii(elt.get('reference_revision'))
317
ie = inventory.TreeReference(file_id, name, parent_id, revision,
320
raise serializer.UnsupportedInventoryKind(kind)
321
ie.revision = revision
322
if revision is not None and entry_cache is not None:
323
# We cache a copy() because callers like to mutate objects, and
324
# that would cause the item in cache to mutate as well.
325
# This has a small effect on many-inventory performance, because
326
# the majority fraction is spent in cache hits, not misses.
327
entry_cache[key] = ie.copy()
332
def unpack_inventory_flat(elt, format_num, unpack_entry,
333
entry_cache=None, return_from_cache=False):
334
"""Unpack a flat XML inventory.
336
:param elt: XML element for the inventory
337
:param format_num: Expected format number
338
:param unpack_entry: Function for unpacking inventory entries
339
:return: An inventory
340
:raise UnexpectedInventoryFormat: When unexpected elements or data is
343
if elt.tag != 'inventory':
344
raise serializer.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
345
format = elt.get('format')
346
if ((format is None and format_num is not None) or
347
format.encode() != format_num):
348
raise serializer.UnexpectedInventoryFormat('Invalid format version %r'
350
revision_id = elt.get('revision_id')
351
if revision_id is not None:
352
revision_id = cache_utf8.encode(revision_id)
353
inv = inventory.Inventory(root_id=None, revision_id=revision_id)
355
ie = unpack_entry(e, entry_cache, return_from_cache)
360
def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
361
"""Serialize an inventory to a flat XML file.
363
:param inv: Inventory to serialize
364
:param append: Function for writing a line of output
365
:param working: If True skip history data - text_sha1, text_size,
366
reference_revision, symlink_target. self._check_revisions(inv)
368
entries = inv.iter_entries()
370
root_path, root_ie = next(entries)
371
for path, ie in entries:
372
if ie.parent_id != root_id:
373
parent_str = b''.join(
374
[b' parent_id="', encode_and_escape(ie.parent_id), b'"'])
377
if ie.kind == 'file':
379
executable = b' executable="yes"'
383
append(b'<file%s file_id="%s" name="%s"%s revision="%s" '
384
b'text_sha1="%s" text_size="%d" />\n' % (
385
executable, encode_and_escape(ie.file_id),
386
encode_and_escape(ie.name), parent_str,
387
encode_and_escape(ie.revision), ie.text_sha1,
390
append(b'<file%s file_id="%s" name="%s"%s />\n' % (
391
executable, encode_and_escape(ie.file_id),
392
encode_and_escape(ie.name), parent_str))
393
elif ie.kind == 'directory':
395
append(b'<directory file_id="%s" name="%s"%s revision="%s" '
397
encode_and_escape(ie.file_id),
398
encode_and_escape(ie.name),
400
encode_and_escape(ie.revision)))
402
append(b'<directory file_id="%s" name="%s"%s />\n' % (
403
encode_and_escape(ie.file_id),
404
encode_and_escape(ie.name),
406
elif ie.kind == 'symlink':
408
append(b'<symlink file_id="%s" name="%s"%s revision="%s" '
409
b'symlink_target="%s" />\n' % (
410
encode_and_escape(ie.file_id),
411
encode_and_escape(ie.name),
413
encode_and_escape(ie.revision),
414
encode_and_escape(ie.symlink_target)))
416
append(b'<symlink file_id="%s" name="%s"%s />\n' % (
417
encode_and_escape(ie.file_id),
418
encode_and_escape(ie.name),
420
elif ie.kind == 'tree-reference':
421
if ie.kind not in supported_kinds:
422
raise serializer.UnsupportedInventoryKind(ie.kind)
424
append(b'<tree-reference file_id="%s" name="%s"%s '
425
b'revision="%s" reference_revision="%s" />\n' % (
426
encode_and_escape(ie.file_id),
427
encode_and_escape(ie.name),
429
encode_and_escape(ie.revision),
430
encode_and_escape(ie.reference_revision)))
432
append(b'<tree-reference file_id="%s" name="%s"%s />\n' % (
433
encode_and_escape(ie.file_id),
434
encode_and_escape(ie.name),
437
raise serializer.UnsupportedInventoryKind(ie.kind)
438
append(b'</inventory>\n')
186
lambda match: match.group(0).encode('unicode_escape'),