22
22
# importing this module is fairly slow because it has to load several
25
from bzrlib.trace import mutter, warning
28
import xml.etree.cElementTree as elementtree
29
from xml.etree.ElementTree import ParseError
28
from cElementTree import (ElementTree, SubElement, Element,
29
XMLTreeBuilder, fromstring, tostring)
30
30
except ImportError:
31
# Fall back to pure python implementation if C extension is unavailable
32
import xml.etree.ElementTree as elementtree
34
from xml.etree.ElementTree import ParseError
36
from xml.parsers.expat import ExpatError as ParseError
38
(ElementTree, SubElement, Element, fromstringlist, tostringlist, tostring,
40
elementtree.ElementTree, elementtree.SubElement, elementtree.Element,
41
elementtree.fromstringlist, elementtree.tostringlist, elementtree.tostring,
42
elementtree.fromstring)
56
class XMLSerializer(serializer.Serializer):
57
"""Abstract XML object serialize/deserialize"""
59
squashes_xml_invalid_characters = True
61
def read_inventory_from_lines(self, lines, revision_id=None,
62
entry_cache=None, return_from_cache=False):
63
"""Read xml_string into an inventory object.
65
:param lines: The xml to read.
66
:param revision_id: If not-None, the expected revision id of the
67
inventory. Some serialisers use this to set the results' root
68
revision. This should be supplied for deserialising all
69
from-repository inventories so that xml5 inventories that were
70
serialised without a revision identifier can be given the right
71
revision id (but not for working tree inventories where users can
72
edit the data without triggering checksum errors or anything).
73
:param entry_cache: An optional cache of InventoryEntry objects. If
74
supplied we will look up entries via (file_id, revision_id) which
75
should map to a valid InventoryEntry (File/Directory/etc) object.
76
:param return_from_cache: Return entries directly from the cache,
77
rather than copying them first. This is only safe if the caller
78
promises not to mutate the returned inventory entries, but it can
79
make some operations significantly faster.
82
return self._unpack_inventory(fromstringlist(lines), revision_id,
83
entry_cache=entry_cache,
84
return_from_cache=return_from_cache)
85
except ParseError as e:
86
raise serializer.UnexpectedInventoryFormat(str(e))
88
def read_inventory(self, f, revision_id=None):
91
return self._unpack_inventory(self._read_element(f),
95
except ParseError as e:
96
raise serializer.UnexpectedInventoryFormat(str(e))
31
mutter('WARNING: using slower ElementTree; consider installing cElementTree'
32
" and make sure it's on your PYTHONPATH")
33
from util.elementtree.ElementTree import (ElementTree, SubElement,
34
Element, XMLTreeBuilder,
37
from bzrlib.errors import BzrError
40
class Serializer(object):
41
"""Abstract object serialize/deserialize"""
42
def write_inventory(self, inv, f):
43
"""Write inventory to a file"""
44
elt = self._pack_inventory(inv)
45
self._write_element(elt, f)
47
def write_inventory_to_string(self, inv):
48
return tostring(self._pack_inventory(inv)) + '\n'
50
def read_inventory_from_string(self, xml_string):
51
return self._unpack_inventory(fromstring(xml_string))
53
def read_inventory(self, f):
54
return self._unpack_inventory(self._read_element(f))
56
def write_revision(self, rev, f):
57
self._write_element(self._pack_revision(rev), f)
98
59
def write_revision_to_string(self, rev):
99
return b''.join(self.write_revision_to_lines(rev))
60
return tostring(self._pack_revision(rev)) + '\n'
101
62
def read_revision(self, f):
102
63
return self._unpack_revision(self._read_element(f))
104
65
def read_revision_from_string(self, xml_string):
105
66
return self._unpack_revision(fromstring(xml_string))
68
def _write_element(self, elt, f):
69
ElementTree(elt).write(f, 'utf-8')
107
72
def _read_element(self, f):
108
73
return ElementTree().parse(f)
111
def escape_invalid_chars(message):
    """Escape the XML-invalid characters in a commit message.

    Every run of characters that falls outside the XML 1.0 ``Char``
    production is replaced by its Python ``unicode_escape`` spelling (a NUL
    character, for instance, becomes the four characters ``\\x00``).

    :param message: Commit message to escape, or None
    :return: tuple with the escaped message and the number of substitutions
        made.  Adjacent invalid characters are replaced as a single run, so
        the count is the number of runs, not of individual characters.
        ``(None, 0)`` is returned when message is None.
    """
    if message is None:
        # Nothing to escape; keep the (text, count) shape for callers.
        return None, 0
    # Python strings can include characters that can't be
    # represented in well-formed XML; escape characters that
    # aren't listed in the XML specification
    # (http://www.w3.org/TR/REC-xml/#NT-Char).
    return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
                   lambda match: match.group(0).encode(
                       'unicode_escape').decode('ascii'),
                   message)
129
def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
130
"""Return a cached version of the string.
132
cElementTree will return a plain string if the XML is plain ascii. It only
133
returns Unicode when it needs to. We want to work in utf-8 strings. So if
134
cElementTree returns a plain string, we can just return the cached version.
135
If it is Unicode, then we need to encode it.
137
:param a_str: An 8-bit string or Unicode as returned by
138
cElementTree.Element.get()
139
:return: A utf-8 encoded 8-bit string.
141
# This is fairly optimized because we know what cElementTree does, this is
142
# not meant as a generic function for all cases. Because it is possible for
143
# an 8-bit string to not be ascii or valid utf8.
144
if a_str.__class__ is str:
145
return _encode_utf8(a_str)
150
_utf8_re = lazy_regex.lazy_compile(b'[&<>\'\"]|[\x80-\xff]+')
151
_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
156
"'": "'", # FIXME: overkill
163
def _unicode_escape_replace(match, _map=_xml_escape_map):
    """Replace a string of non-ascii, non XML safe characters with their escape

    This will escape both Standard XML escapes, like <>"', etc.
    As well as escaping non ascii characters, because ElementTree did.
    This helps us remain compatible to older versions of bzr. We may change
    our policy in the future, though.

    :param match: Regex match object.  Its matched text is looked up in
        ``_map``; on a miss ``ord()`` is applied to it, so the match must be
        a single character.
    :param _map: Mapping from a character to its replacement text, bound as
        a default argument.
    :return: The mapped replacement, or a decimal character reference
        (``&#NNN;``) when the character is not in ``_map``.
    """
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
    # expect the entity to rarely miss. There is about a 10% difference
    # in overall time. But if you miss frequently, then if None is much
    # faster. For our use case, we *rarely* have a revision id, file id
    # or path name that is unicode. So use try/KeyError.
    try:
        return _map[match.group()]
    except KeyError:
        return "&#%d;" % ord(match.group())
182
def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Escape utf8 characters into XML safe ones.

    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
    or it is handling characters with the high-bit set. For ascii characters,
    we just lookup the replacement in the dictionary. For everything else, we
    decode back into Unicode, and then use the XML escape code.

    :param match: Regex match object over a bytes subject.
    :param _map: Mapping from a character to its replacement text, bound as
        a default argument.
    :return: The escaped replacement as bytes.
    """
    try:
        # Decoding with 'replace' turns high-bit bytes into U+FFFD, which is
        # presumably absent from _map, so they fall through to KeyError.
        return _map[match.group().decode('ascii', 'replace')].encode()
    except KeyError:
        # Emit one decimal character reference per decoded code point.
        return b''.join(b'&#%d;' % ord(uni_chr)
                        for uni_chr in match.group().decode('utf8'))
200
def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
    """Encode the string into utf8, and escape invalid XML characters

    :param unicode_or_utf8_str: The text to escape, either a ``str`` or a
        byte string (treated as already being utf-8).
    :param _map: Memo of previously escaped inputs, bound as a default
        argument; it is both read and updated here.
    :return: The escaped text as bytes.
    """
    # We frequently get entities we have not seen before, so it is better
    # to check if None, rather than try/KeyError
    text = _map.get(unicode_or_utf8_str)
    if text is None:
        if isinstance(unicode_or_utf8_str, str):
            # The alternative policy is to do a regular UTF8 encoding
            # and then escape only XML meta characters.
            # Performance is equivalent once you use cache_utf8. *However*
            # this makes the serialized texts incompatible with old versions
            # of bzr. So no net gain. (Perhaps the read code would handle utf8
            # better than entity escapes, but cElementTree seems to do just
            # fine either way)
            text = _unicode_re.sub(
                _unicode_escape_replace, unicode_or_utf8_str).encode()
        else:
            # Plain strings are considered to already be in utf-8 so we do a
            # slightly different method for escaping.
            text = _utf8_re.sub(_utf8_escape_replace,
                                unicode_or_utf8_str)
        # Remember the result so repeated ids/paths are escaped only once.
        _map[unicode_or_utf8_str] = text
    return text
226
"""Clean out the unicode => escaped map"""
227
_to_escaped_map.clear()
230
def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
232
file_id = elt_get('file_id')
233
revision = elt_get('revision')
234
# Check and see if we have already unpacked this exact entry
235
# Some timings for "repo.revision_trees(last_100_revs)"
237
# unmodified 4.1s 40.8s
239
# using fifo 2.83s 29.1s
243
# no_copy 2.00s 20.5s
244
# no_c,dict 1.95s 18.0s
245
# Note that a cache of 10k nodes is more than sufficient to hold all of
246
# the inventory for the last 100 revs for bzr, but not for mysql (20k
247
# is enough for mysql, which saves the same 2s as using a dict)
249
# Breakdown of mysql using time.clock()
250
# 4.1s 2 calls to element.get for file_id, revision_id
251
# 4.5s cache_hit lookup
252
# 7.1s InventoryFile.copy()
253
# 2.4s InventoryDirectory.copy()
254
# 0.4s decoding unique entries
255
# 1.6s decoding entries after FIFO fills up
256
# 0.8s Adding nodes to FIFO (including flushes)
257
# 0.1s cache miss lookups
259
# 4.1s 2 calls to element.get for file_id, revision_id
260
# 9.9s cache_hit lookup
261
# 10.8s InventoryEntry.copy()
262
# 0.3s cache miss lookups
263
# 1.2s decoding entries
264
# 1.0s adding nodes to LRU
265
if entry_cache is not None and revision is not None:
266
key = (file_id, revision)
268
# We copy it, because some operations may mutate it
269
cached_ie = entry_cache[key]
273
# Only copying directory entries drops us 2.85s => 2.35s
274
if return_from_cache:
275
if cached_ie.kind == 'directory':
276
return cached_ie.copy()
278
return cached_ie.copy()
281
if not inventory.InventoryEntry.versionable_kind(kind):
282
raise AssertionError('unsupported entry kind %s' % kind)
284
file_id = get_utf8_or_ascii(file_id)
285
if revision is not None:
286
revision = get_utf8_or_ascii(revision)
287
parent_id = elt_get('parent_id')
288
if parent_id is not None:
289
parent_id = get_utf8_or_ascii(parent_id)
291
if kind == 'directory':
292
ie = inventory.InventoryDirectory(file_id,
296
ie = inventory.InventoryFile(file_id,
299
ie.text_sha1 = elt_get('text_sha1')
300
if ie.text_sha1 is not None:
301
ie.text_sha1 = ie.text_sha1.encode('ascii')
302
if elt_get('executable') == 'yes':
304
v = elt_get('text_size')
305
ie.text_size = v and int(v)
306
elif kind == 'symlink':
307
ie = inventory.InventoryLink(file_id,
310
ie.symlink_target = elt_get('symlink_target')
311
elif kind == 'tree-reference':
312
file_id = get_utf8_or_ascii(elt.attrib['file_id'])
313
name = elt.attrib['name']
314
parent_id = get_utf8_or_ascii(elt.attrib['parent_id'])
315
revision = get_utf8_or_ascii(elt.get('revision'))
316
reference_revision = get_utf8_or_ascii(elt.get('reference_revision'))
317
ie = inventory.TreeReference(file_id, name, parent_id, revision,
320
raise serializer.UnsupportedInventoryKind(kind)
321
ie.revision = revision
322
if revision is not None and entry_cache is not None:
323
# We cache a copy() because callers like to mutate objects, and
324
# that would cause the item in cache to mutate as well.
325
# This has a small effect on many-inventory performance, because
326
# the majority fraction is spent in cache hits, not misses.
327
entry_cache[key] = ie.copy()
332
def unpack_inventory_flat(elt, format_num, unpack_entry,
333
entry_cache=None, return_from_cache=False):
334
"""Unpack a flat XML inventory.
336
:param elt: XML element for the inventory
337
:param format_num: Expected format number
338
:param unpack_entry: Function for unpacking inventory entries
339
:return: An inventory
340
:raise UnexpectedInventoryFormat: When unexpected elements or data is
343
if elt.tag != 'inventory':
344
raise serializer.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
345
format = elt.get('format')
346
if ((format is None and format_num is not None) or
347
format.encode() != format_num):
348
raise serializer.UnexpectedInventoryFormat('Invalid format version %r'
350
revision_id = elt.get('revision_id')
351
if revision_id is not None:
352
revision_id = cache_utf8.encode(revision_id)
353
inv = inventory.Inventory(root_id=None, revision_id=revision_id)
355
ie = unpack_entry(e, entry_cache, return_from_cache)
360
def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
361
"""Serialize an inventory to a flat XML file.
363
:param inv: Inventory to serialize
364
:param append: Function for writing a line of output
365
:param working: If True skip history data - text_sha1, text_size,
366
reference_revision, symlink_target. self._check_revisions(inv)
368
entries = inv.iter_entries()
370
root_path, root_ie = next(entries)
371
for path, ie in entries:
372
if ie.parent_id != root_id:
373
parent_str = b''.join(
374
[b' parent_id="', encode_and_escape(ie.parent_id), b'"'])
377
if ie.kind == 'file':
379
executable = b' executable="yes"'
383
append(b'<file%s file_id="%s" name="%s"%s revision="%s" '
384
b'text_sha1="%s" text_size="%d" />\n' % (
385
executable, encode_and_escape(ie.file_id),
386
encode_and_escape(ie.name), parent_str,
387
encode_and_escape(ie.revision), ie.text_sha1,
390
append(b'<file%s file_id="%s" name="%s"%s />\n' % (
391
executable, encode_and_escape(ie.file_id),
392
encode_and_escape(ie.name), parent_str))
393
elif ie.kind == 'directory':
395
append(b'<directory file_id="%s" name="%s"%s revision="%s" '
397
encode_and_escape(ie.file_id),
398
encode_and_escape(ie.name),
400
encode_and_escape(ie.revision)))
402
append(b'<directory file_id="%s" name="%s"%s />\n' % (
403
encode_and_escape(ie.file_id),
404
encode_and_escape(ie.name),
406
elif ie.kind == 'symlink':
408
append(b'<symlink file_id="%s" name="%s"%s revision="%s" '
409
b'symlink_target="%s" />\n' % (
410
encode_and_escape(ie.file_id),
411
encode_and_escape(ie.name),
413
encode_and_escape(ie.revision),
414
encode_and_escape(ie.symlink_target)))
416
append(b'<symlink file_id="%s" name="%s"%s />\n' % (
417
encode_and_escape(ie.file_id),
418
encode_and_escape(ie.name),
420
elif ie.kind == 'tree-reference':
421
if ie.kind not in supported_kinds:
422
raise serializer.UnsupportedInventoryKind(ie.kind)
424
append(b'<tree-reference file_id="%s" name="%s"%s '
425
b'revision="%s" reference_revision="%s" />\n' % (
426
encode_and_escape(ie.file_id),
427
encode_and_escape(ie.name),
429
encode_and_escape(ie.revision),
430
encode_and_escape(ie.reference_revision)))
432
append(b'<tree-reference file_id="%s" name="%s"%s />\n' % (
433
encode_and_escape(ie.file_id),
434
encode_and_escape(ie.name),
437
raise serializer.UnsupportedInventoryKind(ie.kind)
438
append(b'</inventory>\n')