17
17
"""A collection of functions for handling URL operations."""
19
from __future__ import absolute_import
23
from urllib import parse as urlparse
25
from brzlib.lazy_import import lazy_import
26
lazy_import(globals(), """
27
from posixpath import split as _posix_split
30
from .lazy_import import lazy_import
31
lazy_import(globals(), """
32
from posixpath import split as _posix_split
37
class InvalidURL(errors.PathError):
    """Error reporting an invalid URL supplied to a transport.

    The ``_fmt`` template refers to ``path`` and ``extra`` fields;
    presumably the base class fills them in when rendering the message.
    """

    _fmt = 'Invalid url supplied to transport: "%(path)s"%(extra)s'
42
class InvalidURLJoin(errors.PathError):
44
_fmt = "Invalid URL join request: %(reason)s: %(base)r + %(join_args)r"
46
def __init__(self, reason, base, join_args):
49
self.join_args = join_args
50
errors.PathError.__init__(self, base, reason)
53
class InvalidRebaseURLs(errors.PathError):
55
_fmt = "URLs differ by more than path: %(from_)r and %(to)r"
57
def __init__(self, from_, to):
60
errors.PathError.__init__(
61
self, from_, 'URLs differ by more than path.')
64
37
def basename(url, exclude_trailing_slash=True):
65
38
"""Return the last component of a URL.
88
61
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
91
# Module-level aliases for the urllib.parse percent-encoding helpers.
# Text <-> text variants:
quote = urlparse.quote
unquote = urlparse.unquote
# Bytes-oriented variants:
quote_from_bytes = urlparse.quote_from_bytes
unquote_to_bytes = urlparse.unquote_to_bytes
97
def escape(relpath, safe='/~'):
64
# Private copies of quote and unquote, copied from Python's
65
# urllib module because urllib unconditionally imports socket, which imports
68
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
69
'abcdefghijklmnopqrstuvwxyz'
72
for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
73
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
77
def quote(s, safe='/'):
78
"""quote('abc def') -> 'abc%20def'
80
Each part of a URL, e.g. the path info, the query, etc., has a
81
different set of reserved characters that must be quoted.
83
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
84
the following reserved characters.
86
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
89
Each of these characters is reserved in some component of a URL,
90
but not necessarily in all of them.
92
By default, the quote function is intended for quoting the path
93
section of a URL. Thus, it will not encode '/'. This character
94
is reserved, but in typical usage the quote function is being
95
called on a path where the existing slash characters are used as
101
raise TypeError('None object cannot be quoted')
103
cachekey = (safe, always_safe)
105
(quoter, safe) = _safe_quoters[cachekey]
107
safe_map = _safe_map.copy()
108
safe_map.update([(c, c) for c in safe])
109
quoter = safe_map.__getitem__
110
safe = always_safe + safe
111
_safe_quoters[cachekey] = (quoter, safe)
112
if not s.rstrip(safe):
114
return ''.join(map(quoter, s))
117
_hexdig = '0123456789ABCDEFabcdef'
118
_hextochr = dict((a + b, chr(int(a + b, 16)))
119
for a in _hexdig for b in _hexdig)
122
"""unquote('abc%20def') -> 'abc def'."""
130
s += _hextochr[item[:2]] + item[2:]
133
except UnicodeDecodeError:
134
s += unichr(int(item[:2], 16)) + item[2:]
98
139
"""Escape relpath to be a valid url."""
99
return quote(relpath, safe=safe)
140
if isinstance(relpath, unicode):
141
relpath = relpath.encode('utf-8')
142
# After quoting and encoding, the path should be perfectly
143
# safe as a plain ASCII string, str() just enforces this
144
return str(quote(relpath, safe='/~'))
102
147
def file_relpath(base, path):
237
282
def _win32_local_path_from_url(url):
238
283
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
239
284
if not url.startswith('file://'):
240
raise InvalidURL(url, 'local urls must start with file:///, '
241
'UNC path urls must start with file://')
242
url = strip_segment_parameters(url)
285
raise errors.InvalidURL(url, 'local urls must start with file:///, '
286
'UNC path urls must start with file://')
287
url = split_segment_parameters_raw(url)[0]
243
288
# We strip off all 3 slashes
244
289
win32_url = url[len('file:'):]
245
290
# check for UNC path: //HOST/path
246
291
if not win32_url.startswith('///'):
247
292
if (win32_url[2] == '/'
248
or win32_url[3] in '|:'):
249
raise InvalidURL(url, 'Win32 UNC path urls'
250
' have form file://HOST/path')
293
or win32_url[3] in '|:'):
294
raise errors.InvalidURL(url, 'Win32 UNC path urls'
295
' have form file://HOST/path')
251
296
return unescape(win32_url)
253
298
# allow empty paths so we can serve all roots
340
385
return local_path_to_url(url)
341
386
prefix = url[:path_start]
342
387
path = url[path_start:]
343
if not isinstance(url, str):
388
if not isinstance(url, unicode):
345
390
if c not in _url_safe_characters:
346
raise InvalidURL(url, 'URLs can only contain specific'
347
' safe characters (not %r)' % c)
391
raise errors.InvalidURL(url, 'URLs can only contain specific'
392
' safe characters (not %r)' % c)
348
393
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
349
394
return str(prefix + ''.join(path))
351
396
# We have a unicode (hybrid) url
352
397
path_chars = list(path)
354
for i in range(len(path_chars)):
399
for i in xrange(len(path_chars)):
355
400
if path_chars[i] not in _url_safe_characters:
401
chars = path_chars[i].encode('utf-8')
356
402
path_chars[i] = ''.join(
357
['%%%02X' % c for c in bytearray(path_chars[i].encode('utf-8'))])
403
['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')])
358
404
path = ''.join(path_chars)
359
405
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
360
406
return str(prefix + path)
380
426
if base_scheme != other_scheme:
382
428
elif sys.platform == 'win32' and base_scheme == 'file://':
383
base_drive = base[base_first_slash + 1:base_first_slash + 3]
384
other_drive = other[other_first_slash + 1:other_first_slash + 3]
429
base_drive = base[base_first_slash+1:base_first_slash+3]
430
other_drive = other[other_first_slash+1:other_first_slash+3]
385
431
if base_drive != other_drive:
388
base_path = base[base_first_slash + 1:]
389
other_path = other[other_first_slash + 1:]
434
base_path = base[base_first_slash+1:]
435
other_path = other[other_first_slash+1:]
391
437
if base_path.endswith('/'):
392
438
base_path = base_path[:-1]
473
519
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
474
520
# operates on urls not url+segments, and Transport classes
475
# should not be blindly adding slashes in the first place.
521
# should not be blindly adding slashes in the first place.
476
522
lurl = strip_trailing_slash(url)
477
523
# Segments begin at first comma after last forward slash, if one exists
478
segment_start = lurl.find(",", lurl.rfind("/") + 1)
524
segment_start = lurl.find(",", lurl.rfind("/")+1)
479
525
if segment_start == -1:
481
return (lurl[:segment_start],
482
[str(s) for s in lurl[segment_start + 1:].split(",")])
527
return (lurl[:segment_start], lurl[segment_start+1:].split(","))
485
530
def split_segment_parameters(url):
491
536
(base_url, subsegments) = split_segment_parameters_raw(url)
493
538
for subsegment in subsegments:
495
(key, value) = subsegment.split("=", 1)
497
raise InvalidURL(url, "missing = in subsegment")
498
if not isinstance(key, str):
500
if not isinstance(value, str):
501
raise TypeError(value)
539
(key, value) = subsegment.split("=", 1)
502
540
parameters[key] = value
503
541
return (base_url, parameters)
506
def strip_segment_parameters(url):
507
"""Strip the segment parameters from a URL.
509
:param url: A relative or absolute URL
512
base_url, subsegments = split_segment_parameters_raw(url)
516
544
def join_segment_parameters_raw(base, *subsegments):
517
"""Create a new URL by adding subsegments to an existing one.
545
"""Create a new URL by adding subsegments to an existing one.
519
547
This adds the specified subsegments to the last path in the specified
520
548
base URL. The subsegments should be bytestrings.
544
572
(base, existing_parameters) = split_segment_parameters(url)
545
573
new_parameters = {}
546
574
new_parameters.update(existing_parameters)
547
for key, value in parameters.items():
548
if not isinstance(key, str):
549
raise TypeError("parameter key %r is not a str" % key)
550
if not isinstance(value, str):
551
raise TypeError("parameter value %r for %r is not a str" %
575
for key, value in parameters.iteritems():
576
if type(key) is not str:
577
raise TypeError("parameter key %r is not a bytestring" % key)
578
if type(value) is not str:
579
raise TypeError("parameter value %r for %s is not a bytestring" %
554
raise InvalidURLJoin("= exists in parameter key", url,
582
raise errors.InvalidURLJoin("= exists in parameter key", url,
556
584
new_parameters[key] = value
557
return join_segment_parameters_raw(
558
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
585
return join_segment_parameters_raw(base,
586
*["%s=%s" % item for item in sorted(new_parameters.items())])
561
589
def _win32_strip_local_trailing_slash(url):
619
647
# plain ASCII strings, or the final .decode will
620
648
# try to encode the UNICODE => ASCII, and then decode
652
except UnicodeError, e:
653
raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
623
if isinstance(url, str):
626
except UnicodeError as e:
628
url, 'URL was not a plain ASCII url: %s' % (e,))
629
return urlparse.unquote(url)
655
unquoted = unquote(url)
657
unicode_path = unquoted.decode('utf-8')
658
except UnicodeError, e:
659
raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
632
663
# These are characters that if escaped, should stay that way
633
664
_no_decode_chars = ';/?:@&=+$,#'
634
665
_no_decode_ords = [ord(c) for c in _no_decode_chars]
635
666
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
636
+ ['%02X' % o for o in _no_decode_ords])
637
_hex_display_map = dict(([('%02x' % o, bytes([o])) for o in range(256)]
638
+ [('%02X' % o, bytes([o])) for o in range(256)]))
639
# These entries get mapped to themselves
640
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
641
for hex in _no_decode_hex)
667
+ ['%02X' % o for o in _no_decode_ords])
668
_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
669
+ [('%02X' % o, chr(o)) for o in range(256)]))
670
#These entries get mapped to themselves
671
_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
643
673
# These characters shouldn't be percent-encoded, and it's always safe to
644
674
# unencode them if they are.
645
675
_url_dont_escape_characters = set(
646
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
647
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
648
"0123456789" # Numbers
649
"-._~" # Unreserved characters
676
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
677
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
678
"0123456789" # Numbers
679
"-._~" # Unreserved characters
652
682
# These characters should not be escaped
653
683
_url_safe_characters = set(
654
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
655
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
656
"0123456789" # Numbers
657
"_.-!~*'()" # Unreserved characters
658
"/;?:@&=+$," # Reserved characters
659
"%#" # Extra reserved characters
684
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
685
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
686
"0123456789" # Numbers
687
"_.-!~*'()" # Unreserved characters
688
"/;?:@&=+$," # Reserved characters
689
"%#" # Extra reserved characters
663
def _unescape_segment_for_display(segment, encoding):
664
"""Unescape a segment for display.
666
Helper for unescape_for_display
668
:param url: A 7-bit ASCII URL
669
:param encoding: The final output encoding
671
:return: A unicode string which can be safely encoded into the
674
escaped_chunks = segment.split('%')
675
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
676
for j in range(1, len(escaped_chunks)):
677
item = escaped_chunks[j]
679
escaped_chunks[j] = _hex_display_map[item[:2]]
681
# Put back the percent symbol
682
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8'))
683
except UnicodeDecodeError:
684
escaped_chunks[j] = chr(int(item[:2], 16)).encode('utf-8')
685
escaped_chunks[j] += (item[2:].encode('utf-8'))
686
unescaped = b''.join(escaped_chunks)
688
decoded = unescaped.decode('utf-8')
689
except UnicodeDecodeError:
690
# If this path segment cannot be properly utf-8 decoded
691
# after doing unescaping we will just leave it alone
695
decoded.encode(encoding)
696
except UnicodeEncodeError:
697
# If this chunk cannot be encoded in the local
698
# encoding, then we should leave it alone
701
# Otherwise take the url decoded one
705
692
def unescape_for_display(url, encoding):
706
693
"""Decode what you can for a URL, so that we get a nice looking path.
730
717
# Split into sections to try to decode utf-8
731
718
res = url.split('/')
732
for i in range(1, len(res)):
733
res[i] = _unescape_segment_for_display(res[i], encoding)
719
for i in xrange(1, len(res)):
720
escaped_chunks = res[i].split('%')
721
for j in xrange(1, len(escaped_chunks)):
722
item = escaped_chunks[j]
724
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
726
# Put back the percent symbol
727
escaped_chunks[j] = '%' + item
728
except UnicodeDecodeError:
729
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
730
unescaped = ''.join(escaped_chunks)
732
decoded = unescaped.decode('utf-8')
733
except UnicodeDecodeError:
734
# If this path segment cannot be properly utf-8 decoded
735
# after doing unescaping we will just leave it alone
739
decoded.encode(encoding)
740
except UnicodeEncodeError:
741
# If this chunk cannot be encoded in the local
742
# encoding, then we should leave it alone
745
# Otherwise take the url decoded one
734
747
return u'/'.join(res)
842
853
:param url: URL as bytestring
844
# GZ 2017-06-09: Actually validate ascii-ness
845
# pad.lv/1696545: For the moment, accept both native strings and
847
if isinstance(url, str):
849
elif isinstance(url, str):
852
except UnicodeEncodeError:
853
raise InvalidURL(url)
855
raise InvalidURL(url)
855
if isinstance(url, unicode):
856
raise errors.InvalidURL('should be ascii:\n%r' % url)
857
url = url.encode('utf-8')
856
858
(scheme, netloc, path, params,
857
859
query, fragment) = urlparse.urlparse(url, allow_fragments=False)
858
860
user = password = host = port = None
913
912
:param relpath: relative url string for relative part of remote path.
914
913
:return: urlencoded string for final path.
916
# pad.lv/1696545: For the moment, accept both native strings and
918
if isinstance(relpath, str):
920
elif isinstance(relpath, str):
922
relpath = relpath.encode()
923
except UnicodeEncodeError:
924
raise InvalidURL(relpath)
926
raise InvalidURL(relpath)
915
if not isinstance(relpath, str):
916
raise errors.InvalidURL(relpath)
927
917
relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath)
928
918
if relpath.startswith('/'):