19
19
"""A collection of functions for handling URL operations."""
22
from posixpath import split as _posix_split, normpath as _posix_normpath
25
from bzrlib.lazy_import import lazy_import
26
lazy_import(globals(), """
27
from posixpath import split as _posix_split, normpath as _posix_normpath
27
import bzrlib.errors as errors
31
37
def basename(url, exclude_trailing_slash=True):
111
117
join('http://foo', 'bar') => 'http://foo/bar'
112
118
join('http://foo', 'bar', '../baz') => 'http://foo/baz'
114
m = _url_scheme_re.match(base)
122
match = _url_scheme_re.match(base)
117
scheme = m.group('scheme')
118
path = m.group('path').split('/')
125
scheme = match.group('scheme')
126
path = match.group('path').split('/')
119
127
if path[-1:] == ['']:
120
128
# Strip off a trailing slash
121
129
# This helps both when we are at the root, and when
125
133
path = base.split('/')
135
if scheme is not None and len(path) >= 1:
137
# the path should be represented as an abs path.
138
# we know this must be absolute because of the presence of a URL scheme.
140
path = [''] + path[1:]
142
# create an empty host, but don't alter the path - this might be a
143
# relative url fragment.
128
m = _url_scheme_re.match(arg)
148
match = _url_scheme_re.match(arg)
131
scheme = m.group('scheme')
132
path = m.group('path').split('/')
151
scheme = match.group('scheme')
152
# this skips .. normalisation, making http://host/../../..
154
path = match.group('path').split('/')
155
# set the host and path according to new absolute URL, discarding
156
# any previous values.
157
# XXX: duplicates mess from earlier in this function. This URL
158
# manipulation code needs some cleaning up.
159
if scheme is not None and len(path) >= 1:
162
# url scheme implies absolute path.
165
# no url scheme we take the path as is.
134
for chunk in arg.split('/'):
139
# Don't pop off the host portion
142
raise errors.InvalidURLJoin('Cannot go above root',
168
path = '/'.join(path)
169
path = joinpath(path, arg)
170
path = path.split('/')
171
if remove_root and path[0:1] == ['']:
174
# Remove the leading slash from the path, so long as it isn't also the
175
# trailing slash, which we want to keep if present.
176
if path and path[0] == '' and len(path) > 1:
147
180
if scheme is None:
148
181
return '/'.join(path)
149
182
return scheme + '://' + '/'.join(path)
185
def joinpath(base, *args):
186
"""Join URL path segments to a URL path segment.
188
This is somewhat like osutils.joinpath, but intended for URLs.
190
XXX: this duplicates some normalisation logic, and also duplicates a lot of
191
path handling logic that already exists in some Transport implementations.
192
We really should try to have exactly one place in the code base responsible
193
for combining paths of URLs.
195
path = base.split('/')
196
if len(path) > 1 and path[-1] == '':
197
# If the path ends in a trailing /, remove it.
200
if arg.startswith('/'):
202
for chunk in arg.split('/'):
207
raise errors.InvalidURLJoin('Cannot go above root',
215
return '/'.join(path)
152
218
# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
153
219
def _posix_local_path_from_url(url):
154
220
"""Convert a url like file:///path/to/foo into /path/to/foo"""
166
232
# importing directly from posixpath allows us to test this
167
233
# on non-posix platforms
168
234
return 'file://' + escape(_posix_normpath(
169
bzrlib.osutils._posix_abspath(path)))
235
osutils._posix_abspath(path)))
172
238
def _win32_local_path_from_url(url):
173
239
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
174
if not url.startswith('file:///'):
175
raise errors.InvalidURL(url, 'local urls must start with file:///')
240
if not url.startswith('file://'):
241
raise errors.InvalidURL(url, 'local urls must start with file:///, '
242
'UNC path urls must start with file://')
176
243
# We strip off all 3 slashes
177
win32_url = url[len('file:///'):]
178
if (win32_url[0] not in ('abcdefghijklmnopqrstuvwxyz'
244
win32_url = url[len('file:'):]
245
# check for UNC path: //HOST/path
246
if not win32_url.startswith('///'):
247
if (win32_url[2] == '/'
248
or win32_url[3] in '|:'):
249
raise errors.InvalidURL(url, 'Win32 UNC path urls'
250
' have form file://HOST/path')
251
return unescape(win32_url)
252
# usual local path with drive letter
253
if (win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz'
179
254
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
180
or win32_url[1] not in '|:'
181
or win32_url[2] != '/'):
255
or win32_url[4] not in '|:'
256
or win32_url[5] != '/'):
182
257
raise errors.InvalidURL(url, 'Win32 file urls start with'
183
258
' file:///x:/, where x is a valid drive letter')
184
return win32_url[0].upper() + u':' + unescape(win32_url[2:])
259
return win32_url[3].upper() + u':' + unescape(win32_url[5:])
187
262
def _win32_local_path_to_url(path):
195
270
# which actually strips trailing space characters.
196
271
# The worst part is that under linux ntpath.abspath has different
197
272
# semantics, since 'nt' is not an available module.
198
win32_path = bzrlib.osutils._nt_normpath(
199
bzrlib.osutils._win32_abspath(path)).replace('\\', '/')
273
win32_path = osutils._win32_abspath(path)
274
# check for UNC path \\HOST\path
275
if win32_path.startswith('//'):
276
return 'file:' + escape(win32_path)
200
277
return 'file:///' + win32_path[0].upper() + ':' + escape(win32_path[2:])
215
292
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
293
_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
296
def _unescape_safe_chars(matchobj):
297
"""re.sub callback to convert hex-escapes to plain characters (if safe).
299
e.g. '%7E' will be converted to '~'.
301
hex_digits = matchobj.group(0)[1:]
302
char = chr(int(hex_digits, 16))
303
if char in _url_dont_escape_characters:
306
return matchobj.group(0).upper()
218
309
def normalize_url(url):
219
310
"""Make sure that a path string is in fully normalized URL form.
221
This handles URLs which have unicode characters, spaces,
312
This handles URLs which have unicode characters, spaces,
222
313
special characters, etc.
224
315
It has two basic modes of operation, depending on whether the
237
328
m = _url_scheme_re.match(url)
239
330
return local_path_to_url(url)
331
scheme = m.group('scheme')
332
path = m.group('path')
240
333
if not isinstance(url, unicode):
242
335
if c not in _url_safe_characters:
243
336
raise errors.InvalidURL(url, 'URLs can only contain specific'
244
337
' safe characters (not %r)' % c)
338
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
339
return str(scheme + '://' + ''.join(path))
246
341
# We have a unicode (hybrid) url
247
scheme = m.group('scheme')
248
path = list(m.group('path'))
342
path_chars = list(path)
250
for i in xrange(len(path)):
251
if path[i] not in _url_safe_characters:
252
chars = path[i].encode('utf-8')
253
path[i] = ''.join(['%%%02X' % ord(c) for c in path[i].encode('utf-8')])
254
return scheme + '://' + ''.join(path)
344
for i in xrange(len(path_chars)):
345
if path_chars[i] not in _url_safe_characters:
346
chars = path_chars[i].encode('utf-8')
347
path_chars[i] = ''.join(
348
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
349
path = ''.join(path_chars)
350
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
351
return str(scheme + '://' + path)
257
354
def relative_url(base, other):
437
534
#These entries get mapped to themselves
438
535
_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
537
# These characters shouldn't be percent-encoded, and it's always safe to
538
# unencode them if they are.
539
_url_dont_escape_characters = set(
540
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
541
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
542
"0123456789" # Numbers
543
"-._~" # Unreserved characters
440
546
# These characters should not be escaped
441
_url_safe_characters = set('abcdefghijklmnopqrstuvwxyz'
442
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
547
_url_safe_characters = set(
548
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
549
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
550
"0123456789" # Numbers
551
"_.-!~*'()" # Unreserved characters
552
"/;?:@&=+$," # Reserved characters
553
"%#" # Extra reserved characters
447
556
def unescape_for_display(url, encoding):
448
557
"""Decode what you can for a URL, so that we get a nice looking path.