1
# Copyright (C) 2005, 2006, 2007, 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
revision as _mod_revision,
28
from bzrlib.xml_serializer import SubElement, Element, Serializer
29
from bzrlib.inventory import ROOT_ID, Inventory, InventoryEntry
30
from bzrlib.revision import Revision
31
from bzrlib.errors import BzrError
38
"'":"'", # FIXME: overkill
43
_entry_cache = fifo_cache.FIFOCache(10*1024)
46
def _ensure_utf8_re():
47
"""Make sure the _utf8_re and _unicode_re regexes have been compiled."""
48
global _utf8_re, _unicode_re
50
_utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
51
if _unicode_re is None:
52
_unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
55
def _unicode_escape_replace(match, _map=_xml_escape_map):
56
"""Replace a string of non-ascii, non XML safe characters with their escape
58
This will escape both Standard XML escapes, like <>"', etc.
59
As well as escaping non ascii characters, because ElementTree did.
60
This helps us remain compatible to older versions of bzr. We may change
61
our policy in the future, though.
63
# jam 20060816 Benchmarks show that try/KeyError is faster if you
64
# expect the entity to rarely miss. There is about a 10% difference
65
# in overall time. But if you miss frequently, then if None is much
66
# faster. For our use case, we *rarely* have a revision id, file id
67
# or path name that is unicode. So use try/KeyError.
69
return _map[match.group()]
71
return "&#%d;" % ord(match.group())
74
def _utf8_escape_replace(match, _map=_xml_escape_map):
75
"""Escape utf8 characters into XML safe ones.
77
This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
78
or it is handling characters with the high-bit set. For ascii characters,
79
we just lookup the replacement in the dictionary. For everything else, we
80
decode back into Unicode, and then use the XML escape code.
83
return _map[match.group()]
85
return ''.join('&#%d;' % ord(uni_chr)
86
for uni_chr in match.group().decode('utf8'))
91
def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
92
"""Encode the string into utf8, and escape invalid XML characters"""
93
# We frequently get entities we have not seen before, so it is better
94
# to check if None, rather than try/KeyError
95
text = _map.get(unicode_or_utf8_str)
97
if unicode_or_utf8_str.__class__ == unicode:
98
# The alternative policy is to do a regular UTF8 encoding
99
# and then escape only XML meta characters.
100
# Performance is equivalent once you use cache_utf8. *However*
101
# this makes the serialized texts incompatible with old versions
102
# of bzr. So no net gain. (Perhaps the read code would handle utf8
103
# better than entity escapes, but cElementTree seems to do just fine
105
text = str(_unicode_re.sub(_unicode_escape_replace,
106
unicode_or_utf8_str)) + '"'
108
# Plain strings are considered to already be in utf-8 so we do a
109
# slightly different method for escaping.
110
text = _utf8_re.sub(_utf8_escape_replace,
111
unicode_or_utf8_str) + '"'
112
_map[unicode_or_utf8_str] = text
116
def _get_utf8_or_ascii(a_str,
117
_encode_utf8=cache_utf8.encode,
118
_get_cached_ascii=cache_utf8.get_cached_ascii):
119
"""Return a cached version of the string.
121
cElementTree will return a plain string if the XML is plain ascii. It only
122
returns Unicode when it needs to. We want to work in utf-8 strings. So if
123
cElementTree returns a plain string, we can just return the cached version.
124
If it is Unicode, then we need to encode it.
126
:param a_str: An 8-bit string or Unicode as returned by
127
cElementTree.Element.get()
128
:return: A utf-8 encoded 8-bit string.
130
# This is fairly optimized because we know what cElementTree does, this is
131
# not meant as a generic function for all cases. Because it is possible for
132
# an 8-bit string to not be ascii or valid utf8.
133
if a_str.__class__ == unicode:
134
return _encode_utf8(a_str)
136
return _get_cached_ascii(a_str)
140
"""Clean out the unicode => escaped map"""
141
_to_escaped_map.clear()
144
class Serializer_v8(Serializer):
145
"""This serialiser adds rich roots.
147
Its revision format number matches its inventory number.
153
support_altered_by_hack = True
154
# This format supports the altered-by hack that reads file ids directly out
155
# of the versionedfile, without doing XML parsing.
157
supported_kinds = set(['file', 'directory', 'symlink'])
159
revision_format_num = None
161
def _check_revisions(self, inv):
162
"""Extension point for subclasses to check during serialisation.
164
:param inv: An inventory about to be serialised, to be checked.
165
:raises: AssertionError if an error has occured.
167
if inv.revision_id is None:
168
raise AssertionError()
169
if inv.root.revision is None:
170
raise AssertionError()
172
def _check_cache_size(self, inv_size, entry_cache):
173
"""Check that the entry_cache is large enough.
175
We want the cache to be ~2x the size of an inventory. The reason is
176
because we use a FIFO cache, and how Inventory records are likely to
177
change. In general, you have a small number of records which change
178
often, and a lot of records which do not change at all. So when the
179
cache gets full, you actually flush out a lot of the records you are
180
interested in, which means you need to recreate all of those records.
181
An LRU Cache would be better, but the overhead negates the cache
184
One way to look at it, only the size of the cache > len(inv) is your
185
'working' set. And in general, it shouldn't be a problem to hold 2
186
inventories in memory anyway.
188
:param inv_size: The number of entries in an inventory.
190
if entry_cache is None:
192
# 1.5 times might also be reasonable.
193
recommended_min_cache_size = inv_size * 1.5
194
if entry_cache.cache_size() < recommended_min_cache_size:
195
recommended_cache_size = inv_size * 2
196
trace.mutter('Resizing the inventory entry cache from %d to %d',
197
entry_cache.cache_size(), recommended_cache_size)
198
entry_cache.resize(recommended_cache_size)
200
def write_inventory_to_lines(self, inv):
201
"""Return a list of lines with the encoded inventory."""
202
return self.write_inventory(inv, None)
204
def write_inventory_to_string(self, inv, working=False):
205
"""Just call write_inventory with a StringIO and return the value.
207
:param working: If True skip history data - text_sha1, text_size,
208
reference_revision, symlink_target.
210
sio = cStringIO.StringIO()
211
self.write_inventory(inv, sio, working)
212
return sio.getvalue()
214
def write_inventory(self, inv, f, working=False):
215
"""Write inventory to a file.
217
:param inv: the inventory to write.
218
:param f: the file to write. (May be None if the lines are the desired
220
:param working: If True skip history data - text_sha1, text_size,
221
reference_revision, symlink_target.
222
:return: The inventory as a list of lines.
225
self._check_revisions(inv)
227
append = output.append
228
self._append_inventory_root(append, inv)
229
entries = inv.iter_entries()
231
root_path, root_ie = entries.next()
232
for path, ie in entries:
233
if ie.parent_id != self.root_id:
234
parent_str = ' parent_id="'
235
parent_id = _encode_and_escape(ie.parent_id)
239
if ie.kind == 'file':
241
executable = ' executable="yes"'
245
append('<file%s file_id="%s name="%s%s%s revision="%s '
246
'text_sha1="%s" text_size="%d" />\n' % (
247
executable, _encode_and_escape(ie.file_id),
248
_encode_and_escape(ie.name), parent_str, parent_id,
249
_encode_and_escape(ie.revision), ie.text_sha1,
252
append('<file%s file_id="%s name="%s%s%s />\n' % (
253
executable, _encode_and_escape(ie.file_id),
254
_encode_and_escape(ie.name), parent_str, parent_id))
255
elif ie.kind == 'directory':
257
append('<directory file_id="%s name="%s%s%s revision="%s '
259
_encode_and_escape(ie.file_id),
260
_encode_and_escape(ie.name),
261
parent_str, parent_id,
262
_encode_and_escape(ie.revision)))
264
append('<directory file_id="%s name="%s%s%s />\n' % (
265
_encode_and_escape(ie.file_id),
266
_encode_and_escape(ie.name),
267
parent_str, parent_id))
268
elif ie.kind == 'symlink':
270
append('<symlink file_id="%s name="%s%s%s revision="%s '
271
'symlink_target="%s />\n' % (
272
_encode_and_escape(ie.file_id),
273
_encode_and_escape(ie.name),
274
parent_str, parent_id,
275
_encode_and_escape(ie.revision),
276
_encode_and_escape(ie.symlink_target)))
278
append('<symlink file_id="%s name="%s%s%s />\n' % (
279
_encode_and_escape(ie.file_id),
280
_encode_and_escape(ie.name),
281
parent_str, parent_id))
282
elif ie.kind == 'tree-reference':
283
if ie.kind not in self.supported_kinds:
284
raise errors.UnsupportedInventoryKind(ie.kind)
286
append('<tree-reference file_id="%s name="%s%s%s '
287
'revision="%s reference_revision="%s />\n' % (
288
_encode_and_escape(ie.file_id),
289
_encode_and_escape(ie.name),
290
parent_str, parent_id,
291
_encode_and_escape(ie.revision),
292
_encode_and_escape(ie.reference_revision)))
294
append('<tree-reference file_id="%s name="%s%s%s />\n' % (
295
_encode_and_escape(ie.file_id),
296
_encode_and_escape(ie.name),
297
parent_str, parent_id))
299
raise errors.UnsupportedInventoryKind(ie.kind)
300
append('</inventory>\n')
303
# Just to keep the cache from growing without bounds
304
# but we may actually not want to do clear the cache
308
def _append_inventory_root(self, append, inv):
309
"""Append the inventory root to output."""
310
if inv.revision_id is not None:
311
revid1 = ' revision_id="'
312
revid2 = _encode_and_escape(inv.revision_id)
316
append('<inventory format="%s"%s%s>\n' % (
317
self.format_num, revid1, revid2))
318
append('<directory file_id="%s name="%s revision="%s />\n' % (
319
_encode_and_escape(inv.root.file_id),
320
_encode_and_escape(inv.root.name),
321
_encode_and_escape(inv.root.revision)))
323
def _pack_revision(self, rev):
324
"""Revision object -> xml tree"""
325
# For the XML format, we need to write them as Unicode rather than as
326
# utf-8 strings. So that cElementTree can handle properly escaping
328
decode_utf8 = cache_utf8.decode
329
revision_id = rev.revision_id
330
if isinstance(revision_id, str):
331
revision_id = decode_utf8(revision_id)
332
format_num = self.format_num
333
if self.revision_format_num is not None:
334
format_num = self.revision_format_num
335
root = Element('revision',
336
committer = rev.committer,
337
timestamp = '%.3f' % rev.timestamp,
338
revision_id = revision_id,
339
inventory_sha1 = rev.inventory_sha1,
342
if rev.timezone is not None:
343
root.set('timezone', str(rev.timezone))
345
msg = SubElement(root, 'message')
346
msg.text = rev.message
349
pelts = SubElement(root, 'parents')
350
pelts.tail = pelts.text = '\n'
351
for parent_id in rev.parent_ids:
352
_mod_revision.check_not_reserved_id(parent_id)
353
p = SubElement(pelts, 'revision_ref')
355
if isinstance(parent_id, str):
356
parent_id = decode_utf8(parent_id)
357
p.set('revision_id', parent_id)
359
self._pack_revision_properties(rev, root)
362
def _pack_revision_properties(self, rev, under_element):
363
top_elt = SubElement(under_element, 'properties')
364
for prop_name, prop_value in sorted(rev.properties.items()):
365
prop_elt = SubElement(top_elt, 'property')
366
prop_elt.set('name', prop_name)
367
prop_elt.text = prop_value
371
def _unpack_inventory(self, elt, revision_id=None, entry_cache=None):
372
"""Construct from XML Element"""
373
if elt.tag != 'inventory':
374
raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
375
format = elt.get('format')
376
if format != self.format_num:
377
raise errors.UnexpectedInventoryFormat('Invalid format version %r'
379
if entry_cache is None:
380
entry_cache = _entry_cache
381
revision_id = elt.get('revision_id')
382
if revision_id is not None:
383
revision_id = cache_utf8.encode(revision_id)
384
inv = inventory.Inventory(root_id=None, revision_id=revision_id)
386
ie = self._unpack_entry(e, entry_cache=entry_cache)
388
self._check_cache_size(len(inv), entry_cache)
391
def _unpack_entry(self, elt, entry_cache=None):
393
file_id = elt_get('file_id')
394
revision = elt_get('revision')
395
# Check and see if we have already unpacked this exact entry
396
# Some timings for "repo.revision_trees(last_100_revs)"
398
# unmodified 4.1s 40.8s
400
# using fifo 2.83s 29.1s
404
# no_copy 2.00s 20.5s
405
# no_c,dict 1.95s 18.0s
406
# Note that a cache of 10k nodes is more than sufficient to hold all of
407
# the inventory for the last 100 revs for bzr, but not for mysql (20k
408
# is enough for mysql, which saves the same 2s as using a dict)
410
# Breakdown of mysql using time.clock()
411
# 4.1s 2 calls to element.get for file_id, revision_id
412
# 4.5s cache_hit lookup
413
# 7.1s InventoryFile.copy()
414
# 2.4s InventoryDirectory.copy()
415
# 0.4s decoding unique entries
416
# 1.6s decoding entries after FIFO fills up
417
# 0.8s Adding nodes to FIFO (including flushes)
418
# 0.1s cache miss lookups
420
# 4.1s 2 calls to element.get for file_id, revision_id
421
# 9.9s cache_hit lookup
422
# 10.8s InventoryEntry.copy()
423
# 0.3s cache miss lookus
424
# 1.2s decoding entries
425
# 1.0s adding nodes to LRU
426
if entry_cache is not None and revision is not None:
427
key = (file_id, revision)
429
# We copy it, because some operatations may mutate it
430
cached_ie = entry_cache[key]
434
# Only copying directory entries drops us 2.85s => 2.35s
435
# if cached_ie.kind == 'directory':
436
# return cached_ie.copy()
438
return cached_ie.copy()
441
if not InventoryEntry.versionable_kind(kind):
442
raise AssertionError('unsupported entry kind %s' % kind)
444
get_cached = _get_utf8_or_ascii
446
file_id = get_cached(file_id)
447
if revision is not None:
448
revision = get_cached(revision)
449
parent_id = elt_get('parent_id')
450
if parent_id is not None:
451
parent_id = get_cached(parent_id)
453
if kind == 'directory':
454
ie = inventory.InventoryDirectory(file_id,
458
ie = inventory.InventoryFile(file_id,
461
ie.text_sha1 = elt_get('text_sha1')
462
if elt_get('executable') == 'yes':
464
v = elt_get('text_size')
465
ie.text_size = v and int(v)
466
elif kind == 'symlink':
467
ie = inventory.InventoryLink(file_id,
470
ie.symlink_target = elt_get('symlink_target')
472
raise errors.UnsupportedInventoryKind(kind)
473
ie.revision = revision
474
if revision is not None and entry_cache is not None:
475
# We cache a copy() because callers like to mutate objects, and
476
# that would cause the item in cache to mutate as well.
477
# This has a small effect on many-inventory performance, because
478
# the majority fraction is spent in cache hits, not misses.
479
entry_cache[key] = ie.copy()
483
def _unpack_revision(self, elt):
484
"""XML Element -> Revision object"""
485
format = elt.get('format')
486
format_num = self.format_num
487
if self.revision_format_num is not None:
488
format_num = self.revision_format_num
489
if format is not None:
490
if format != format_num:
491
raise BzrError("invalid format version %r on revision"
493
get_cached = _get_utf8_or_ascii
494
rev = Revision(committer = elt.get('committer'),
495
timestamp = float(elt.get('timestamp')),
496
revision_id = get_cached(elt.get('revision_id')),
497
inventory_sha1 = elt.get('inventory_sha1')
499
parents = elt.find('parents') or []
501
rev.parent_ids.append(get_cached(p.get('revision_id')))
502
self._unpack_revision_properties(elt, rev)
503
v = elt.get('timezone')
507
rev.timezone = int(v)
508
rev.message = elt.findtext('message') # text of <message>
511
def _unpack_revision_properties(self, elt, rev):
512
"""Unpack properties onto a revision."""
513
props_elt = elt.find('properties')
516
for prop_elt in props_elt:
517
if prop_elt.tag != 'property':
518
raise AssertionError(
519
"bad tag under properties list: %r" % prop_elt.tag)
520
name = prop_elt.get('name')
521
value = prop_elt.text
522
# If a property had an empty value ('') cElementTree reads
523
# that back as None, convert it back to '', so that all
524
# properties have string values
527
if name in rev.properties:
528
raise AssertionError("repeated property %r" % name)
529
rev.properties[name] = value
532
serializer_v8 = Serializer_v8()