17
17
"""A collection of function for handling URL operations."""
19
from __future__ import absolute_import
25
from bzrlib.lazy_import import lazy_import
26
lazy_import(globals(), """
27
from posixpath import split as _posix_split
23
from urllib import parse as urlparse
30
from .lazy_import import lazy_import
31
lazy_import(globals(), """
32
from posixpath import split as _posix_split
37
class InvalidURL(errors.PathError):
    """Error raised for a URL that cannot be used by a transport.

    The message template expects ``path`` and ``extra`` fields.
    """

    _fmt = 'Invalid url supplied to transport: "%(path)s"%(extra)s'
42
class InvalidURLJoin(errors.PathError):
44
_fmt = "Invalid URL join request: %(reason)s: %(base)r + %(join_args)r"
46
def __init__(self, reason, base, join_args):
49
self.join_args = join_args
50
errors.PathError.__init__(self, base, reason)
53
class InvalidRebaseURLs(errors.PathError):
55
_fmt = "URLs differ by more than path: %(from_)r and %(to)r"
57
def __init__(self, from_, to):
60
errors.PathError.__init__(
61
self, from_, 'URLs differ by more than path.')
37
64
def basename(url, exclude_trailing_slash=True):
38
65
"""Return the last component of a URL.
61
88
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
64
# Private copies of quote and unquote, copied from Python's
65
# urllib module because urllib unconditionally imports socket, which imports
68
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
69
'abcdefghijklmnopqrstuvwxyz'
72
for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
73
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
77
def quote(s, safe='/'):
78
"""quote('abc def') -> 'abc%20def'
80
Each part of a URL, e.g. the path info, the query, etc., has a
81
different set of reserved characters that must be quoted.
83
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
84
the following reserved characters.
86
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
89
Each of these characters is reserved in some component of a URL,
90
but not necessarily in all of them.
92
By default, the quote function is intended for quoting the path
93
section of a URL. Thus, it will not encode '/'. This character
94
is reserved, but in typical usage the quote function is being
95
called on a path where the existing slash characters are used as
101
raise TypeError('None object cannot be quoted')
103
cachekey = (safe, always_safe)
105
(quoter, safe) = _safe_quoters[cachekey]
107
safe_map = _safe_map.copy()
108
safe_map.update([(c, c) for c in safe])
109
quoter = safe_map.__getitem__
110
safe = always_safe + safe
111
_safe_quoters[cachekey] = (quoter, safe)
112
if not s.rstrip(safe):
114
return ''.join(map(quoter, s))
117
_hexdig = '0123456789ABCDEFabcdef'
118
_hextochr = dict((a + b, chr(int(a + b, 16)))
119
for a in _hexdig for b in _hexdig)
122
"""unquote('abc%20def') -> 'abc def'."""
130
s += _hextochr[item[:2]] + item[2:]
133
except UnicodeDecodeError:
134
s += unichr(int(item[:2], 16)) + item[2:]
91
# Expose the standard library's percent-encoding helpers under this
# module's own names so callers do not need to import urllib themselves.
quote_from_bytes = urlparse.quote_from_bytes
unquote_to_bytes = urlparse.unquote_to_bytes
quote = urlparse.quote
unquote = urlparse.unquote
97
def escape(relpath, safe='/~'):
    """Escape relpath to be a valid url.

    :param relpath: The relative path to percent-encode.
    :param safe: Characters that are passed to quote() as ``safe`` and
        therefore left unquoted. Defaults to '/~'.
    :return: The result of quote(relpath, safe=safe).
    """
    # The caller-supplied ``safe`` set is forwarded as-is; no Python 2
    # ``unicode`` pre-encoding step is needed because quote() accepts text.
    return quote(relpath, safe=safe)
147
102
def file_relpath(base, path):
282
237
def _win32_local_path_from_url(url):
283
238
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
284
239
if not url.startswith('file://'):
285
raise errors.InvalidURL(url, 'local urls must start with file:///, '
286
'UNC path urls must start with file://')
287
url = split_segment_parameters_raw(url)[0]
240
raise InvalidURL(url, 'local urls must start with file:///, '
241
'UNC path urls must start with file://')
242
url = strip_segment_parameters(url)
288
243
# We strip off all 3 slashes
289
244
win32_url = url[len('file:'):]
290
245
# check for UNC path: //HOST/path
291
246
if not win32_url.startswith('///'):
292
247
if (win32_url[2] == '/'
293
or win32_url[3] in '|:'):
294
raise errors.InvalidURL(url, 'Win32 UNC path urls'
295
' have form file://HOST/path')
248
or win32_url[3] in '|:'):
249
raise InvalidURL(url, 'Win32 UNC path urls'
250
' have form file://HOST/path')
296
251
return unescape(win32_url)
298
253
# allow empty paths so we can serve all roots
385
340
return local_path_to_url(url)
386
341
prefix = url[:path_start]
387
342
path = url[path_start:]
388
if not isinstance(url, unicode):
343
if not isinstance(url, str):
390
345
if c not in _url_safe_characters:
391
raise errors.InvalidURL(url, 'URLs can only contain specific'
392
' safe characters (not %r)' % c)
346
raise InvalidURL(url, 'URLs can only contain specific'
347
' safe characters (not %r)' % c)
393
348
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
394
349
return str(prefix + ''.join(path))
396
351
# We have a unicode (hybrid) url
397
352
path_chars = list(path)
399
for i in xrange(len(path_chars)):
354
for i in range(len(path_chars)):
400
355
if path_chars[i] not in _url_safe_characters:
401
chars = path_chars[i].encode('utf-8')
402
356
path_chars[i] = ''.join(
403
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
357
['%%%02X' % c for c in bytearray(path_chars[i].encode('utf-8'))])
404
358
path = ''.join(path_chars)
405
359
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
406
360
return str(prefix + path)
426
380
if base_scheme != other_scheme:
428
382
elif sys.platform == 'win32' and base_scheme == 'file://':
429
base_drive = base[base_first_slash+1:base_first_slash+3]
430
other_drive = other[other_first_slash+1:other_first_slash+3]
383
base_drive = base[base_first_slash + 1:base_first_slash + 3]
384
other_drive = other[other_first_slash + 1:other_first_slash + 3]
431
385
if base_drive != other_drive:
434
base_path = base[base_first_slash+1:]
435
other_path = other[other_first_slash+1:]
388
base_path = base[base_first_slash + 1:]
389
other_path = other[other_first_slash + 1:]
437
391
if base_path.endswith('/'):
438
392
base_path = base_path[:-1]
519
473
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
520
474
# operates on urls not url+segments, and Transport classes
521
# should not be blindly adding slashes in the first place.
475
# should not be blindly adding slashes in the first place.
522
476
lurl = strip_trailing_slash(url)
523
477
# Segments begin at first comma after last forward slash, if one exists
524
segment_start = lurl.find(",", lurl.rfind("/")+1)
478
segment_start = lurl.find(",", lurl.rfind("/") + 1)
525
479
if segment_start == -1:
527
return (lurl[:segment_start], lurl[segment_start+1:].split(","))
481
return (lurl[:segment_start],
482
[str(s) for s in lurl[segment_start + 1:].split(",")])
530
485
def split_segment_parameters(url):
536
491
(base_url, subsegments) = split_segment_parameters_raw(url)
538
493
for subsegment in subsegments:
539
(key, value) = subsegment.split("=", 1)
495
(key, value) = subsegment.split("=", 1)
497
raise InvalidURL(url, "missing = in subsegment")
498
if not isinstance(key, str):
500
if not isinstance(value, str):
501
raise TypeError(value)
540
502
parameters[key] = value
541
503
return (base_url, parameters)
506
def strip_segment_parameters(url):
507
"""Strip the segment parameters from a URL.
509
:param url: A relative or absolute URL
512
base_url, subsegments = split_segment_parameters_raw(url)
544
516
def join_segment_parameters_raw(base, *subsegments):
545
"""Create a new URL by adding subsegments to an existing one.
517
"""Create a new URL by adding subsegments to an existing one.
547
519
This adds the specified subsegments to the last path in the specified
548
520
base URL. The subsegments should be bytestrings.
572
544
(base, existing_parameters) = split_segment_parameters(url)
573
545
new_parameters = {}
574
546
new_parameters.update(existing_parameters)
575
for key, value in parameters.iteritems():
576
if type(key) is not str:
577
raise TypeError("parameter key %r is not a bytestring" % key)
578
if type(value) is not str:
579
raise TypeError("parameter value %r for %s is not a bytestring" %
547
for key, value in parameters.items():
548
if not isinstance(key, str):
549
raise TypeError("parameter key %r is not a str" % key)
550
if not isinstance(value, str):
551
raise TypeError("parameter value %r for %r is not a str" %
582
raise errors.InvalidURLJoin("= exists in parameter key", url,
554
raise InvalidURLJoin("= exists in parameter key", url,
584
556
new_parameters[key] = value
585
return join_segment_parameters_raw(base,
586
*["%s=%s" % item for item in sorted(new_parameters.items())])
557
return join_segment_parameters_raw(
558
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
589
561
def _win32_strip_local_trailing_slash(url):
647
619
# plain ASCII strings, or the final .decode will
648
620
# try to encode the UNICODE => ASCII, and then decode
652
except UnicodeError, e:
653
raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
655
unquoted = unquote(url)
657
unicode_path = unquoted.decode('utf-8')
658
except UnicodeError, e:
659
raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
623
if isinstance(url, str):
626
except UnicodeError as e:
628
url, 'URL was not a plain ASCII url: %s' % (e,))
629
return urlparse.unquote(url)
663
632
# These are characters that if escaped, should stay that way
664
633
_no_decode_chars = ';/?:@&=+$,#'
_no_decode_ords = [ord(c) for c in _no_decode_chars]
# Two-digit hex spellings (lower-case, then upper-case) of every character
# that must remain percent-encoded.
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
                  + ['%02X' % o for o in _no_decode_ords])
# Map each two-digit hex escape, in either case, to the single byte it
# denotes. Values are bytes because the display unescaper joins its chunks
# with b''.join().
_hex_display_map = {
    fmt % o: bytes([o])
    for fmt in ('%02x', '%02X')
    for o in range(256)
}
# These entries get mapped to themselves
_hex_display_map.update(
    (hex_digits, b'%' + hex_digits.encode('ascii'))
    for hex_digits in _no_decode_hex)
673
643
# These characters shouldn't be percent-encoded, and it's always safe to
674
644
# unencode them if they are.
675
645
_url_dont_escape_characters = set(
676
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
677
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
678
"0123456789" # Numbers
679
"-._~" # Unreserved characters
646
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
647
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
648
"0123456789" # Numbers
649
"-._~" # Unreserved characters
682
652
# These characters should not be escaped
683
653
_url_safe_characters = set(
684
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
685
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
686
"0123456789" # Numbers
687
"_.-!~*'()" # Unreserved characters
688
"/;?:@&=+$," # Reserved characters
689
"%#" # Extra reserved characters
654
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
655
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
656
"0123456789" # Numbers
657
"_.-!~*'()" # Unreserved characters
658
"/;?:@&=+$," # Reserved characters
659
"%#" # Extra reserved characters
663
def _unescape_segment_for_display(segment, encoding):
664
"""Unescape a segment for display.
666
Helper for unescape_for_display
668
:param url: A 7-bit ASCII URL
669
:param encoding: The final output encoding
671
:return: A unicode string which can be safely encoded into the
674
escaped_chunks = segment.split('%')
675
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
676
for j in range(1, len(escaped_chunks)):
677
item = escaped_chunks[j]
679
escaped_chunks[j] = _hex_display_map[item[:2]]
681
# Put back the percent symbol
682
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8'))
683
except UnicodeDecodeError:
684
escaped_chunks[j] = chr(int(item[:2], 16)).encode('utf-8')
685
escaped_chunks[j] += (item[2:].encode('utf-8'))
686
unescaped = b''.join(escaped_chunks)
688
decoded = unescaped.decode('utf-8')
689
except UnicodeDecodeError:
690
# If this path segment cannot be properly utf-8 decoded
691
# after doing unescaping we will just leave it alone
695
decoded.encode(encoding)
696
except UnicodeEncodeError:
697
# If this chunk cannot be encoded in the local
698
# encoding, then we should leave it alone
701
# Otherwise take the url decoded one
692
705
def unescape_for_display(url, encoding):
693
706
"""Decode what you can for a URL, so that we get a nice looking path.
717
730
# Split into sections to try to decode utf-8
718
731
res = url.split('/')
719
for i in xrange(1, len(res)):
720
escaped_chunks = res[i].split('%')
721
for j in xrange(1, len(escaped_chunks)):
722
item = escaped_chunks[j]
724
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
726
# Put back the percent symbol
727
escaped_chunks[j] = '%' + item
728
except UnicodeDecodeError:
729
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
730
unescaped = ''.join(escaped_chunks)
732
decoded = unescaped.decode('utf-8')
733
except UnicodeDecodeError:
734
# If this path segment cannot be properly utf-8 decoded
735
# after doing unescaping we will just leave it alone
739
decoded.encode(encoding)
740
except UnicodeEncodeError:
741
# If this chunk cannot be encoded in the local
742
# encoding, then we should leave it alone
745
# Otherwise take the url decoded one
732
for i in range(1, len(res)):
733
res[i] = _unescape_segment_for_display(res[i], encoding)
747
734
return u'/'.join(res)
853
842
:param url: URL as bytestring
855
if isinstance(url, unicode):
856
raise errors.InvalidURL('should be ascii:\n%r' % url)
857
url = url.encode('utf-8')
844
# GZ 2017-06-09: Actually validate ascii-ness
845
# pad.lv/1696545: For the moment, accept both native strings and
847
if isinstance(url, str):
849
elif isinstance(url, str):
852
except UnicodeEncodeError:
853
raise InvalidURL(url)
855
raise InvalidURL(url)
858
856
(scheme, netloc, path, params,
859
857
query, fragment) = urlparse.urlparse(url, allow_fragments=False)
860
858
user = password = host = port = None
912
913
:param relpath: relative url string for relative part of remote path.
913
914
:return: urlencoded string for final path.
915
if not isinstance(relpath, str):
916
raise errors.InvalidURL(relpath)
916
# pad.lv/1696545: For the moment, accept both native strings and
918
if isinstance(relpath, str):
920
elif isinstance(relpath, str):
922
relpath = relpath.encode()
923
except UnicodeEncodeError:
924
raise InvalidURL(relpath)
926
raise InvalidURL(relpath)
917
927
relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath)
918
928
if relpath.startswith('/'):