/brz/remove-bazaar : revision 6355.1.1

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/xml_serializer.py

Committer: Jelmer Vernooij
Date: 2011-12-12 10:55:01 UTC
mto: This revision was merged to the branch mainline in revision 6361.
Revision ID: jelmer@samba.org-20111212105501-bxfi2xlqd20eax76

Move some utility functions to xml_serializer.

files modified:
bzrlib/xml5.py

bzrlib/xml8.py

bzrlib/xml_serializer.py

Show diffs side-by-side

added added

removed removed

bzrlib/xml_serializer.py

import util.elementtree as elementtree

from xml.parsers.expat import ExpatError as ParseError

from bzrlib import errors

from bzrlib import (

cache_utf8,

lazy_regex,

errors,

)

class XMLSerializer(Serializer):

130

134

return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',

131

135

lambda match: match.group(0).encode('unicode_escape'),

132

136

message)

137

138

139

def get_utf8_or_ascii(a_str,
                      _encode_utf8=cache_utf8.encode,
                      _get_cached_ascii=cache_utf8.get_cached_ascii):
    """Return a utf-8 encoded 8-bit version of a string from cElementTree.

    cElementTree hands back a plain 8-bit string when the XML text is plain
    ascii and only produces Unicode when it needs to.  We want to work in
    utf-8 strings, so Unicode input is encoded with ``_encode_utf8`` while
    any other input is interned and returned as it is.

    This is deliberately tuned to what cElementTree produces and is not a
    generic helper: an arbitrary 8-bit string is not guaranteed to be ascii
    or valid utf-8, and no validation is done here.

    :param a_str: An 8-bit string or Unicode as returned by
        cElementTree.Element.get()
    :param _encode_utf8: Callable applied to Unicode input; defaults to
        cache_utf8.encode.
    :param _get_cached_ascii: Part of the signature but not used in this
        function body.
    :return: A utf-8 encoded 8-bit string.
    """
    # Identity test against the class (not isinstance): only objects whose
    # class is exactly ``unicode`` are encoded.
    if a_str.__class__ is not unicode:
        return intern(a_str)
    return _encode_utf8(a_str)

160

161

162

# Pattern for 8-bit strings: matches either a single XML metacharacter
# (& < > ' ") or a run of one or more bytes in the range 0x80-0xff.
_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')

163

# Pattern for Unicode strings: matches exactly one character that is either
# an XML metacharacter (& < > ' ") or lies in the range U+0080-U+FFFF.
_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')

164

165

166

# Maps each XML metacharacter to its predefined entity reference.  The
# values must be the entity references themselves ("&amp;", "&lt;", ...):
# mapping a character to itself would leave the text unescaped.
_xml_escape_map = {
    "&": '&amp;',
    "'": "&apos;",  # FIXME: overkill
    "\"": "&quot;",
    "<": "&lt;",
    ">": "&gt;",
}

173

174

175

def _unicode_escape_replace(match, _map=_xml_escape_map):
    """Return the XML-escaped replacement for one matched character.

    Characters present in ``_map`` (the standard XML escapes, like <>"')
    are replaced by their mapped value.  Any other matched character is
    written as a decimal numeric character reference (``&#NNN;``), because
    ElementTree escaped non-ascii characters too.  This helps us remain
    compatible with older versions of bzr.  We may change our policy in the
    future, though.

    :param match: A regex match object; its ``group()`` must be a single
        character, since it is passed to ``ord()`` on a map miss.
    :param _map: Mapping from character to replacement text.
    :return: The replacement string for the matched character.
    """
    matched_char = match.group()
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
    # expect the entity to rarely miss. There is about a 10% difference
    # in overall time. But if you miss frequently, then if None is much
    # faster. For our use case, we *rarely* have a revision id, file id
    # or path name that is unicode. So use try/KeyError.
    try:
        return _map[matched_char]
    except KeyError:
        return "&#%d;" % ord(matched_char)

192

193

194

def _utf8_escape_replace(match, _map=_xml_escape_map):
    """Return the XML-escaped replacement for a match in a utf8 string.

    The match is one of two things: a "standard" character such as "&<>,
    whose replacement is looked up directly in ``_map``, or text that is
    not in the map (characters with the high bit set).  In the second case
    the matched bytes are decoded from utf8 back into Unicode and every
    resulting character is written as a decimal numeric character
    reference (``&#NNN;``).

    :param match: A regex match object over an 8-bit string.
    :param _map: Mapping from character to replacement text.
    :return: The replacement string for the matched text.
    """
    matched_bytes = match.group()
    try:
        return _map[matched_bytes]
    except KeyError:
        # Not a standard escape: fall through and escape per character.
        pass
    references = ['&#%d;' % ord(uni_chr)
                  for uni_chr in matched_bytes.decode('utf8')]
    return ''.join(references)

207

208

209

# Module-level cache from an input string to its escaped text.  It is the
# default ``_map`` argument of encode_and_escape and is emptied by
# _clear_cache.
_to_escaped_map = {}

210

211

def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
    """Encode the string into utf8, and escape invalid XML characters.

    Results are remembered in ``_map`` keyed by the input string, so a
    repeated input is answered from the cache.  Note that the returned
    text always has a closing double-quote character appended after the
    escaped content.

    :param unicode_or_utf8_str: A Unicode string, or an 8-bit string that
        is considered to already be in utf-8.
    :param _map: Cache dict; by default shared across calls on purpose.
    :return: The escaped text followed by ``"``.
    """
    # We frequently get entities we have not seen before, so it is better
    # to check for None, rather than try/KeyError.
    cached = _map.get(unicode_or_utf8_str)
    if cached is not None:
        return cached

    if unicode_or_utf8_str.__class__ is unicode:
        # The alternative policy is to do a regular UTF8 encoding
        # and then escape only XML meta characters.
        # Performance is equivalent once you use cache_utf8. *However*
        # this makes the serialized texts incompatible with old versions
        # of bzr. So no net gain. (Perhaps the read code would handle utf8
        # better than entity escapes, but cElementTree seems to do just fine
        # either way)
        escaped = str(_unicode_re.sub(_unicode_escape_replace,
                                      unicode_or_utf8_str))
    else:
        # Plain strings are considered to already be in utf-8 so we do a
        # slightly different method for escaping.
        escaped = _utf8_re.sub(_utf8_escape_replace, unicode_or_utf8_str)

    text = escaped + '"'
    _map[unicode_or_utf8_str] = text
    return text

234

235

236

def _clear_cache():
    """Empty the module-level map from input strings to escaped text."""
    _to_escaped_map.clear()

Older »