17
17
"""A collection of functions for handling URL operations."""
19
from __future__ import absolute_import
23
from urllib import parse as urlparse
28
from urllib import parse as urlparse
30
30
from .lazy_import import lazy_import
31
31
lazy_import(globals(), """
32
32
from posixpath import split as _posix_split
37
class InvalidURL(errors.PathError):
39
_fmt = 'Invalid url supplied to transport: "%(path)s"%(extra)s'
42
class InvalidURLJoin(errors.PathError):
44
_fmt = "Invalid URL join request: %(reason)s: %(base)r + %(join_args)r"
46
def __init__(self, reason, base, join_args):
49
self.join_args = join_args
50
errors.PathError.__init__(self, base, reason)
53
class InvalidRebaseURLs(errors.PathError):
55
_fmt = "URLs differ by more than path: %(from_)r and %(to)r"
57
def __init__(self, from_, to):
60
errors.PathError.__init__(
61
self, from_, 'URLs differ by more than path.')
64
45
def basename(url, exclude_trailing_slash=True):
88
69
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
91
quote_from_bytes = urlparse.quote_from_bytes
92
quote = urlparse.quote
93
unquote_to_bytes = urlparse.unquote_to_bytes
94
unquote = urlparse.unquote
97
def escape(relpath, safe='/~'):
72
# Private copies of quote and unquote, copied from Python's
73
# urllib module because urllib unconditionally imports socket, which imports
76
always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
77
b'abcdefghijklmnopqrstuvwxyz'
80
for i, c in zip(range(256), bytes(bytearray(range(256)))):
81
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i).encode('ascii')
85
def quote(s, safe=b'/'):
86
"""quote('abc def') -> 'abc%20def'
88
Each part of a URL, e.g. the path info, the query, etc., has a
89
different set of reserved characters that must be quoted.
91
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
92
the following reserved characters.
94
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
97
Each of these characters is reserved in some component of a URL,
98
but not necessarily in all of them.
100
By default, the quote function is intended for quoting the path
101
section of a URL. Thus, it will not encode '/'. This character
102
is reserved, but in typical usage the quote function is being
103
called on a path where the existing slash characters are used as
109
raise TypeError('None object cannot be quoted')
111
cachekey = (safe, always_safe)
113
(quoter, safe) = _safe_quoters[cachekey]
115
safe_map = _safe_map.copy()
116
safe_map.update([(c, c) for c in safe])
117
quoter = safe_map.__getitem__
118
safe = always_safe + safe
119
_safe_quoters[cachekey] = (quoter, safe)
120
if not s.rstrip(safe):
122
return b''.join(map(quoter, s))
125
_hexdig = '0123456789ABCDEFabcdef'
126
_hextochr = dict((a + b, chr(int(a + b, 16)))
127
for a in _hexdig for b in _hexdig)
130
"""unquote('abc%20def') -> 'abc def'."""
138
s += _hextochr[item[:2]] + item[2:]
141
except UnicodeDecodeError:
142
s += unichr(int(item[:2], 16)) + item[2:]
98
147
"""Escape relpath to be a valid url."""
99
return quote(relpath, safe=safe)
148
if isinstance(relpath, text_type):
149
relpath = relpath.encode('utf-8')
150
return quote(relpath, safe=b'/~')
102
153
def file_relpath(base, path):
185
236
We really should try to have exactly one place in the code base responsible
186
237
for combining paths of URLs.
188
path = base.split('/')
189
if len(path) > 1 and path[-1] == '':
190
# If the path ends in a trailing /, remove it.
239
path = base.split(b'/')
240
if len(path) > 1 and path[-1] == b'':
241
# If the path ends in a trailing /, remove it.
193
if arg.startswith('/'):
244
if arg.startswith(b'/'):
195
for chunk in arg.split('/'):
246
for chunk in arg.split(b'/'):
200
raise InvalidURLJoin('Cannot go above root',
251
raise errors.InvalidURLJoin('Cannot go above root',
204
255
path.append(chunk)
208
return '/'.join(path)
259
return b'/'.join(path)
211
262
# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
212
263
def _posix_local_path_from_url(url):
213
264
"""Convert a url like file:///path/to/foo into /path/to/foo"""
214
url = strip_segment_parameters(url)
215
file_localhost_prefix = 'file://localhost/'
265
url = split_segment_parameters_raw(url)[0]
266
file_localhost_prefix = b'file://localhost/'
216
267
if url.startswith(file_localhost_prefix):
217
268
path = url[len(file_localhost_prefix) - 1:]
218
elif not url.startswith('file:///'):
269
elif not url.startswith(b'file:///'):
270
raise errors.InvalidURL(
220
271
url, 'local urls must start with file:/// or file://localhost/')
222
path = url[len('file://'):]
273
path = url[len(b'file://'):]
223
274
# We only strip off 2 slashes
224
275
return unescape(path)
232
283
# importing directly from posixpath allows us to test this
233
284
# on non-posix platforms
234
return 'file://' + escape(osutils._posix_abspath(path))
285
return b'file://' + escape(osutils._posix_abspath(path))
237
288
def _win32_local_path_from_url(url):
238
289
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
239
290
if not url.startswith('file://'):
240
raise InvalidURL(url, 'local urls must start with file:///, '
241
'UNC path urls must start with file://')
242
url = strip_segment_parameters(url)
291
raise errors.InvalidURL(url, 'local urls must start with file:///, '
292
'UNC path urls must start with file://')
293
url = split_segment_parameters_raw(url)[0]
243
294
# We strip off all 3 slashes
244
295
win32_url = url[len('file:'):]
245
296
# check for UNC path: //HOST/path
246
297
if not win32_url.startswith('///'):
247
298
if (win32_url[2] == '/'
248
or win32_url[3] in '|:'):
249
raise InvalidURL(url, 'Win32 UNC path urls'
250
' have form file://HOST/path')
299
or win32_url[3] in '|:'):
300
raise errors.InvalidURL(url, 'Win32 UNC path urls'
301
' have form file://HOST/path')
251
302
return unescape(win32_url)
253
304
# allow empty paths so we can serve all roots
473
525
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
474
526
# operates on urls not url+segments, and Transport classes
475
# should not be blindly adding slashes in the first place.
527
# should not be blindly adding slashes in the first place.
476
528
lurl = strip_trailing_slash(url)
477
529
# Segments begin at first comma after last forward slash, if one exists
478
segment_start = lurl.find(",", lurl.rfind("/") + 1)
530
segment_start = lurl.find(b",", lurl.rfind(b"/")+1)
479
531
if segment_start == -1:
481
return (lurl[:segment_start],
482
[str(s) for s in lurl[segment_start + 1:].split(",")])
533
return (lurl[:segment_start], lurl[segment_start+1:].split(b","))
485
536
def split_segment_parameters(url):
491
542
(base_url, subsegments) = split_segment_parameters_raw(url)
493
544
for subsegment in subsegments:
495
(key, value) = subsegment.split("=", 1)
497
raise InvalidURL(url, "missing = in subsegment")
498
if not isinstance(key, str):
500
if not isinstance(value, str):
501
raise TypeError(value)
545
(key, value) = subsegment.split("=", 1)
502
546
parameters[key] = value
503
547
return (base_url, parameters)
506
def strip_segment_parameters(url):
507
"""Strip the segment parameters from a URL.
509
:param url: A relative or absolute URL
512
base_url, subsegments = split_segment_parameters_raw(url)
516
550
def join_segment_parameters_raw(base, *subsegments):
517
"""Create a new URL by adding subsegments to an existing one.
551
"""Create a new URL by adding subsegments to an existing one.
519
553
This adds the specified subsegments to the last path in the specified
520
554
base URL. The subsegments should be bytestrings.
546
580
new_parameters.update(existing_parameters)
547
581
for key, value in parameters.items():
548
582
if not isinstance(key, str):
549
raise TypeError("parameter key %r is not a str" % key)
583
raise TypeError("parameter key %r is not a bytestring" % key)
550
584
if not isinstance(value, str):
551
raise TypeError("parameter value %r for %r is not a str" %
585
raise TypeError("parameter value %r for %s is not a bytestring" %
554
raise InvalidURLJoin("= exists in parameter key", url,
588
raise errors.InvalidURLJoin("= exists in parameter key", url,
556
590
new_parameters[key] = value
557
return join_segment_parameters_raw(
558
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
591
return join_segment_parameters_raw(base,
592
*["%s=%s" % item for item in sorted(new_parameters.items())])
561
595
def _win32_strip_local_trailing_slash(url):
619
653
# plain ASCII strings, or the final .decode will
620
654
# try to encode the UNICODE => ASCII, and then decode
623
if isinstance(url, str):
656
if isinstance(url, text_type):
658
url = url.encode("ascii")
626
659
except UnicodeError as e:
628
url, 'URL was not a plain ASCII url: %s' % (e,))
629
return urlparse.unquote(url)
660
raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
662
unquoted = unquote(url)
664
unicode_path = unquoted.decode('utf-8')
665
except UnicodeError as e:
666
raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
632
670
# These are characters that if escaped, should stay that way
633
671
_no_decode_chars = ';/?:@&=+$,#'
634
672
_no_decode_ords = [ord(c) for c in _no_decode_chars]
635
673
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
636
+ ['%02X' % o for o in _no_decode_ords])
637
_hex_display_map = dict(([('%02x' % o, bytes([o])) for o in range(256)]
638
+ [('%02X' % o, bytes([o])) for o in range(256)]))
639
# These entries get mapped to themselves
640
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
641
for hex in _no_decode_hex)
674
+ ['%02X' % o for o in _no_decode_ords])
675
_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
676
+ [('%02X' % o, chr(o)) for o in range(256)]))
677
# These entries get mapped to themselves
678
_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
643
680
# These characters shouldn't be percent-encoded, and it's always safe to
644
681
# unencode them if they are.
645
682
_url_dont_escape_characters = set(
646
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
647
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
648
"0123456789" # Numbers
649
"-._~" # Unreserved characters
683
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
684
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
685
"0123456789" # Numbers
686
"-._~" # Unreserved characters
652
689
# These characters should not be escaped
653
690
_url_safe_characters = set(
654
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
655
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
656
"0123456789" # Numbers
657
"_.-!~*'()" # Unreserved characters
658
"/;?:@&=+$," # Reserved characters
659
"%#" # Extra reserved characters
691
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
692
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
693
"0123456789" # Numbers
694
"_.-!~*'()" # Unreserved characters
695
"/;?:@&=+$," # Reserved characters
696
"%#" # Extra reserved characters
663
def _unescape_segment_for_display(segment, encoding):
664
"""Unescape a segment for display.
666
Helper for unescape_for_display
668
:param url: A 7-bit ASCII URL
669
:param encoding: The final output encoding
671
:return: A unicode string which can be safely encoded into the
674
escaped_chunks = segment.split('%')
675
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
676
for j in range(1, len(escaped_chunks)):
677
item = escaped_chunks[j]
679
escaped_chunks[j] = _hex_display_map[item[:2]]
681
# Put back the percent symbol
682
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8'))
683
except UnicodeDecodeError:
684
escaped_chunks[j] = chr(int(item[:2], 16)).encode('utf-8')
685
escaped_chunks[j] += (item[2:].encode('utf-8'))
686
unescaped = b''.join(escaped_chunks)
688
decoded = unescaped.decode('utf-8')
689
except UnicodeDecodeError:
690
# If this path segment cannot be properly utf-8 decoded
691
# after doing unescaping we will just leave it alone
695
decoded.encode(encoding)
696
except UnicodeEncodeError:
697
# If this chunk cannot be encoded in the local
698
# encoding, then we should leave it alone
701
# Otherwise take the url decoded one
705
699
def unescape_for_display(url, encoding):
706
700
"""Decode what you can for a URL, so that we get a nice looking path.
730
724
# Split into sections to try to decode utf-8
731
725
res = url.split('/')
732
726
for i in range(1, len(res)):
733
res[i] = _unescape_segment_for_display(res[i], encoding)
727
escaped_chunks = res[i].split('%')
728
for j in range(1, len(escaped_chunks)):
729
item = escaped_chunks[j]
731
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
733
# Put back the percent symbol
734
escaped_chunks[j] = '%' + item
735
except UnicodeDecodeError:
736
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
737
unescaped = ''.join(escaped_chunks)
739
decoded = unescaped.decode('utf-8')
740
except UnicodeDecodeError:
741
# If this path segment cannot be properly utf-8 decoded
742
# after doing unescaping we will just leave it alone
746
decoded.encode(encoding)
747
except UnicodeEncodeError:
748
# If this chunk cannot be encoded in the local
749
# encoding, then we should leave it alone
752
# Otherwise take the url decoded one
734
754
return u'/'.join(res)
842
860
:param url: URL as bytestring
844
# GZ 2017-06-09: Actually validate ascii-ness
845
# pad.lv/1696545: For the moment, accept both native strings and
847
if isinstance(url, str):
849
elif isinstance(url, str):
852
except UnicodeEncodeError:
853
raise InvalidURL(url)
855
raise InvalidURL(url)
862
if isinstance(url, unicode):
863
raise errors.InvalidURL('should be ascii:\n%r' % url)
864
url = url.encode('utf-8')
856
865
(scheme, netloc, path, params,
857
866
query, fragment) = urlparse.urlparse(url, allow_fragments=False)
858
867
user = password = host = port = None
913
919
:param relpath: relative url string for relative part of remote path.
914
920
:return: urlencoded string for final path.
916
# pad.lv/1696545: For the moment, accept both native strings and
918
if isinstance(relpath, str):
920
elif isinstance(relpath, str):
922
relpath = relpath.encode()
923
except UnicodeEncodeError:
924
raise InvalidURL(relpath)
926
raise InvalidURL(relpath)
922
if not isinstance(relpath, str):
923
raise errors.InvalidURL(relpath)
927
924
relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath)
928
925
if relpath.startswith('/'):