bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
4763.2.4
by John Arbash Meinel
merge bzr.2.1 in preparation for NEWS entry. |
1 |
# Copyright (C) 2005-2010 Canonical Ltd
|
|
1887.1.1
by Adeodato Simó
Do not separate paragraphs in the copyright statement with blank lines, |
2 |
#
|
|
1
by mbp at sourcefrog
import from baz patch-364 |
3 |
# This program is free software; you can redistribute it and/or modify
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
|
1887.1.1
by Adeodato Simó
Do not separate paragraphs in the copyright statement with blank lines, |
7 |
#
|
|
1
by mbp at sourcefrog
import from baz patch-364 |
8 |
# This program is distributed in the hope that it will be useful,
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
|
1887.1.1
by Adeodato Simó
Do not separate paragraphs in the copyright statement with blank lines, |
12 |
#
|
|
1
by mbp at sourcefrog
import from baz patch-364 |
13 |
# You should have received a copy of the GNU General Public License
|
14 |
# along with this program; if not, write to the Free Software
|
|
|
4183.7.1
by Sabin Iacob
update FSF mailing address |
15 |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
1
by mbp at sourcefrog
import from baz patch-364 |
16 |
|
17 |
"""XML externalization support."""
|
|
18 |
||
|
48
by Martin Pool
witty comment |
19 |
# "XML is like violence: if it doesn't solve your problem, you aren't
|
20 |
# using enough of it." -- various
|
|
21 |
||
|
1180
by Martin Pool
- start splitting code for xml (de)serialization away from objects |
22 |
# importing this module is fairly slow because it has to load several
|
23 |
# ElementTree bits
|
|
24 |
||
|
5340.11.1
by Martin
Remove monkey patching of private ElementTree escaping functions entirely |
25 |
import re |
26 |
||
|
4237.3.1
by Jelmer Vernooij
Add new module with generic serializer information; keep XML-specific bits in |
27 |
from bzrlib.serializer import Serializer |
|
5121.2.4
by Jelmer Vernooij
Remove more unused imports. |
28 |
from bzrlib.trace import mutter |
|
1248
by Martin Pool
- new weave based cleanup [broken] |
29 |
|
|
802
by Martin Pool
- Remove XMLMixin class in favour of simple pack_xml, unpack_xml functions |
30 |
try: |
|
2039.2.1
by Martin Pool
Load python2.5's ElementTree if present |
31 |
try: |
32 |
# it's in this package in python2.5
|
|
33 |
from xml.etree.cElementTree import (ElementTree, SubElement, Element, |
|
34 |
XMLTreeBuilder, fromstring, tostring) |
|
35 |
import xml.etree as elementtree |
|
|
4797.66.1
by Martin
Prevent AttributeError in xml_serializer on certain cElementTree setups |
36 |
# Also import ElementTree module so monkey-patching below always works
|
37 |
import xml.etree.ElementTree |
|
|
2039.2.1
by Martin Pool
Load python2.5's ElementTree if present |
38 |
except ImportError: |
39 |
from cElementTree import (ElementTree, SubElement, Element, |
|
40 |
XMLTreeBuilder, fromstring, tostring) |
|
|
3475.1.2
by John Arbash Meinel
Fix missing import |
41 |
import elementtree.ElementTree |
|
2029.2.1
by Marien Zwart
Handle the different exception (non-c)ElementTree raises. |
42 |
ParseError = SyntaxError |
|
802
by Martin Pool
- Remove XMLMixin class in favour of simple pack_xml, unpack_xml functions |
43 |
except ImportError: |
|
1185.33.68
by Martin Pool
Emit warning to trace file only if using cElementTree. |
44 |
mutter('WARNING: using slower ElementTree; consider installing cElementTree' |
45 |
" and make sure it's on your PYTHONPATH") |
|
|
2039.2.1
by Martin Pool
Load python2.5's ElementTree if present |
46 |
# this copy is shipped with bzr
|
|
1227
by Martin Pool
- methods to deserialize objects from strings |
47 |
from util.elementtree.ElementTree import (ElementTree, SubElement, |
|
1248
by Martin Pool
- new weave based cleanup [broken] |
48 |
Element, XMLTreeBuilder, |
49 |
fromstring, tostring) |
|
|
1772.1.1
by mbp at sourcefrog
Fix up loading of fallback ElementTree |
50 |
import util.elementtree as elementtree |
|
2029.2.1
by Marien Zwart
Handle the different exception (non-c)ElementTree raises. |
51 |
from xml.parsers.expat import ExpatError as ParseError |
|
802
by Martin Pool
- Remove XMLMixin class in favour of simple pack_xml, unpack_xml functions |
52 |
|
|
6355.1.1
by Jelmer Vernooij
Move some utility functions to xml_serializer. |
53 |
from bzrlib import ( |
54 |
cache_utf8, |
|
55 |
lazy_regex, |
|
56 |
errors, |
|
57 |
)
|
|
|
1180
by Martin Pool
- start splitting code for xml (de)serialization away from objects |
58 |
|
59 |
||
|
4237.3.1
by Jelmer Vernooij
Add new module with generic serializer information; keep XML-specific bits in |
60 |
class XMLSerializer(Serializer):
    """Abstract XML object serialize/deserialize.

    This class provides the generic reading/writing plumbing around
    ElementTree.  The actual conversion between objects and elements is
    delegated to ``_unpack_inventory``, ``_pack_revision`` and
    ``_unpack_revision``, none of which are defined in this class
    (presumably supplied by concrete subclasses).
    """

    squashes_xml_invalid_characters = True

    def read_inventory_from_string(self, xml_string, revision_id=None,
                                   entry_cache=None, return_from_cache=False):
        """Read xml_string into an inventory object.

        :param xml_string: The xml to read.
        :param revision_id: If not-None, the expected revision id of the
            inventory. Some serialisers use this to set the results' root
            revision. This should be supplied for deserialising all
            from-repository inventories so that xml5 inventories that were
            serialised without a revision identifier can be given the right
            revision id (but not for working tree inventories where users can
            edit the data without triggering checksum errors or anything).
        :param entry_cache: An optional cache of InventoryEntry objects. If
            supplied we will look up entries via (file_id, revision_id) which
            should map to a valid InventoryEntry (File/Directory/etc) object.
        :param return_from_cache: Return entries directly from the cache,
            rather than copying them first. This is only safe if the caller
            promises not to mutate the returned inventory entries, but it can
            make some operations significantly faster.
        :raises errors.UnexpectedInventoryFormat: if the XML cannot be
            parsed (a ParseError is raised while reading it).
        """
        try:
            return self._unpack_inventory(fromstring(xml_string), revision_id,
                                          entry_cache=entry_cache,
                                          return_from_cache=return_from_cache)
        except ParseError as e:
            raise errors.UnexpectedInventoryFormat(e)

    def read_inventory(self, f, revision_id=None):
        """Read an inventory from the file-like object f, then close f.

        :param f: File-like object containing the XML; it is always closed
            before this method returns or raises.
        :param revision_id: Accepted for interface compatibility.
        :raises errors.UnexpectedInventoryFormat: if the XML cannot be
            parsed (a ParseError is raised while reading it).
        """
        try:
            try:
                # NOTE(review): the revision_id argument is not forwarded; a
                # literal None is passed instead.  This may be deliberate
                # (see the working-tree remark in
                # read_inventory_from_string) -- confirm before changing.
                return self._unpack_inventory(self._read_element(f),
                                              revision_id=None)
            finally:
                f.close()
        except ParseError as e:
            raise errors.UnexpectedInventoryFormat(e)

    def write_revision(self, rev, f):
        """Pack rev into an element and write it to the file-like object f."""
        self._write_element(self._pack_revision(rev), f)

    def write_revision_to_string(self, rev):
        """Return rev serialised as a string, with a trailing newline."""
        return tostring(self._pack_revision(rev)) + '\n'

    def read_revision(self, f):
        """Parse the XML in file-like object f and unpack it as a revision."""
        return self._unpack_revision(self._read_element(f))

    def read_revision_from_string(self, xml_string):
        """Parse xml_string and unpack it as a revision."""
        return self._unpack_revision(fromstring(xml_string))

    def _write_element(self, elt, f):
        """Write elt to f as a utf-8 document followed by a newline."""
        ElementTree(elt).write(f, 'utf-8')
        f.write('\n')

    def _read_element(self, f):
        """Parse f with ElementTree and return the result of parse()."""
        return ElementTree().parse(f)
|
|
1713.1.12
by Robert Collins
Improve serialisation of xml performance by overriding elementree's escape routines. |
120 |
|
121 |
||
|
4222.1.1
by Jelmer Vernooij
Make function for escaping invalid XML characters public. |
122 |
def escape_invalid_chars(message):
    """Escape the XML-invalid characters in a commit message.

    Each run of consecutive characters that fall outside the set allowed by
    the XML specification is replaced by its ``unicode_escape`` encoding.

    :param message: Commit message to escape, or None
    :return: tuple of (escaped message, number of substitutions made), as
        returned by ``re.subn``; ``(None, 0)`` when message is None.  A run
        of several adjacent invalid characters counts as one substitution.
    """
    if message is None:
        return None, 0

    def _escape_run(match):
        # Turn one run of disallowed characters into backslash escapes.
        return match.group(0).encode('unicode_escape')

    # Python strings can include characters that can't be
    # represented in well-formed XML; escape characters that
    # aren't listed in the XML specification
    # (http://www.w3.org/TR/REC-xml/#NT-Char).
    return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
                   _escape_run,
                   message)
|
|
6355.1.1
by Jelmer Vernooij
Move some utility functions to xml_serializer. |
137 |
|
138 |
||
139 |
def get_utf8_or_ascii(a_str,
                      _encode_utf8=cache_utf8.encode,
                      _get_cached_ascii=cache_utf8.get_cached_ascii):
    """Return a cached version of the string.

    cElementTree hands back a plain string when the XML is plain ascii and
    only produces Unicode when it has to.  We want to work in utf-8 strings,
    so a plain string is simply interned and returned, while a Unicode
    string is encoded to utf-8 first.

    :param a_str: An 8-bit string or Unicode as returned by
        cElementTree.Element.get()
    :return: A utf-8 encoded 8-bit string.
    """
    # This is fairly optimized because we know what cElementTree does, this is
    # not meant as a generic function for all cases. Because it is possible for
    # an 8-bit string to not be ascii or valid utf8.
    if a_str.__class__ is not unicode:
        return intern(a_str)
    return _encode_utf8(a_str)
|
160 |
||
161 |
||
162 |
# Matches either a single XML metacharacter (& < > ' ") or a run of
# characters in the range \x80-\xff.
_utf8_re = lazy_regex.lazy_compile(
    '[&<>\'\"]|[\x80-\xff]+')
# Matches one character at a time: an XML metacharacter or anything in the
# range U+0080..U+FFFF.
_unicode_re = lazy_regex.lazy_compile(
    u'[&<>\'\"\u0080-\uffff]')
|
164 |
||
165 |
||
166 |
# Maps each XML metacharacter to its predefined entity reference.  The
# values must be the escaped forms; mapping a character to itself would make
# the escape functions that consult this table no-ops.
_xml_escape_map = {
    "&": '&amp;',
    "'": "&apos;",  # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
    }
|
|
173 |
||
174 |
||
175 |
def _unicode_escape_replace(match, _map=_xml_escape_map):
    """Return the XML escape for the single character matched by ``match``.

    Standard XML metacharacters (<>"' and &) are looked up in ``_map``.
    Any other matched character is turned into a numeric character
    reference, because ElementTree did the same; this keeps us compatible
    with older versions of bzr.  We may change our policy in the future,
    though.

    :param match: A regex match object whose whole match is one character.
    :return: The replacement text for that character.
    """
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
    #       expect the entity to rarely miss. There is about a 10% difference
    #       in overall time. But if you miss frequently, then if None is much
    #       faster. For our use case, we *rarely* have a revision id, file id
    #       or path name that is unicode. So use try/KeyError.
    matched = match.group()
    try:
        return _map[matched]
    except KeyError:
        return "&#%d;" % ord(matched)
|
192 |
||
193 |
||
194 |
def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Escape utf8 characters into XML safe ones.

    The match is one of two things.  If it is a "standard" character such
    as "&<> it is found in ``_map`` and the mapped replacement is returned.
    Otherwise it is text with the high bit set: that text is decoded from
    utf8 back into Unicode and every resulting character is written as a
    numeric character reference.

    :param match: A regex match object over a utf8-encoded string.
    :return: The replacement text for the matched span.
    """
    matched = match.group()
    try:
        return _map[matched]
    except KeyError:
        references = ['&#%d;' % ord(uni_chr)
                      for uni_chr in matched.decode('utf8')]
        return ''.join(references)
|
207 |
||
208 |
||
209 |
# Cache of already-escaped strings, keyed by the original input string.
_to_escaped_map = {}


def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
    """Encode the string into utf8, and escape invalid XML characters.

    Results are remembered in ``_map`` so each distinct input is only
    escaped once.

    :param unicode_or_utf8_str: A Unicode string, or a plain string that is
        considered to already be utf-8.
    :return: The escaped text with a closing double-quote character
        appended.
    """
    # We frequently get entities we have not seen before, so it is better
    # to check if None, rather than try/KeyError
    cached = _map.get(unicode_or_utf8_str)
    if cached is not None:
        return cached

    if unicode_or_utf8_str.__class__ is unicode:
        # The alternative policy is to do a regular UTF8 encoding
        # and then escape only XML meta characters.
        # Performance is equivalent once you use cache_utf8. *However*
        # this makes the serialized texts incompatible with old versions
        # of bzr. So no net gain. (Perhaps the read code would handle utf8
        # better than entity escapes, but cElementTree seems to do just fine
        # either way)
        escaped = str(_unicode_re.sub(_unicode_escape_replace,
                                      unicode_or_utf8_str))
    else:
        # Plain strings are considered to already be in utf-8 so we do a
        # slightly different method for escaping.
        escaped = _utf8_re.sub(_utf8_escape_replace,
                               unicode_or_utf8_str)

    text = escaped + '"'
    _map[unicode_or_utf8_str] = text
    return text
|
234 |
||
235 |
||
236 |
def _clear_cache():
    """Empty the cache of escaped strings.

    The dictionary is cleared in place instead of being rebound to a new
    object.
    """
    _to_escaped_map.clear()