88
96
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
91
quote_from_bytes = urlparse.quote_from_bytes
92
quote = urlparse.quote
93
unquote_to_bytes = urlparse.unquote_to_bytes
99
# Private copies of quote and unquote, copied from Python's
100
# urllib module because urllib unconditionally imports socket, which imports
103
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
104
'abcdefghijklmnopqrstuvwxyz'
107
for i, c in zip(range(256), ''.join(map(chr, range(256)))):
108
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
112
def quote(s, safe='/'):
113
"""quote('abc def') -> 'abc%20def'
115
Each part of a URL, e.g. the path info, the query, etc., has a
116
different set of reserved characters that must be quoted.
118
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
119
the following reserved characters.
121
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
124
Each of these characters is reserved in some component of a URL,
125
but not necessarily in all of them.
127
By default, the quote function is intended for quoting the path
128
section of a URL. Thus, it will not encode '/'. This character
129
is reserved, but in typical usage the quote function is being
130
called on a path where the existing slash characters are used as
136
raise TypeError('None object cannot be quoted')
138
cachekey = (safe, always_safe)
140
(quoter, safe) = _safe_quoters[cachekey]
142
safe_map = _safe_map.copy()
143
safe_map.update([(c, c) for c in safe])
144
quoter = safe_map.__getitem__
145
safe = always_safe + safe
146
_safe_quoters[cachekey] = (quoter, safe)
147
if not s.rstrip(safe):
149
return ''.join(map(quoter, s))
94
152
unquote = urlparse.unquote
97
def escape(relpath, safe='/~'):
98
156
"""Escape relpath to be a valid url."""
99
return quote(relpath, safe=safe)
157
if not isinstance(relpath, str):
158
relpath = relpath.encode('utf-8')
159
return quote(relpath, safe='/~')
102
162
def file_relpath(base, path):
238
298
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
239
299
if not url.startswith('file://'):
240
300
raise InvalidURL(url, 'local urls must start with file:///, '
241
'UNC path urls must start with file://')
242
url = strip_segment_parameters(url)
301
'UNC path urls must start with file://')
302
url = split_segment_parameters_raw(url)[0]
243
303
# We strip off all 3 slashes
244
304
win32_url = url[len('file:'):]
245
305
# check for UNC path: //HOST/path
246
306
if not win32_url.startswith('///'):
247
307
if (win32_url[2] == '/'
248
or win32_url[3] in '|:'):
308
or win32_url[3] in '|:'):
249
309
raise InvalidURL(url, 'Win32 UNC path urls'
250
' have form file://HOST/path')
310
' have form file://HOST/path')
251
311
return unescape(win32_url)
253
313
# allow empty paths so we can serve all roots
473
534
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
474
535
# operates on urls not url+segments, and Transport classes
475
# should not be blindly adding slashes in the first place.
536
# should not be blindly adding slashes in the first place.
476
537
lurl = strip_trailing_slash(url)
477
538
# Segments begin at first comma after last forward slash, if one exists
478
segment_start = lurl.find(",", lurl.rfind("/") + 1)
539
segment_start = lurl.find(",", lurl.rfind("/")+1)
479
540
if segment_start == -1:
481
return (lurl[:segment_start],
482
[str(s) for s in lurl[segment_start + 1:].split(",")])
542
return (lurl[:segment_start], lurl[segment_start+1:].split(","))
485
545
def split_segment_parameters(url):
491
551
(base_url, subsegments) = split_segment_parameters_raw(url)
493
553
for subsegment in subsegments:
495
(key, value) = subsegment.split("=", 1)
497
raise InvalidURL(url, "missing = in subsegment")
498
if not isinstance(key, str):
500
if not isinstance(value, str):
501
raise TypeError(value)
554
(key, value) = subsegment.split("=", 1)
502
555
parameters[key] = value
503
556
return (base_url, parameters)
506
def strip_segment_parameters(url):
507
"""Strip the segment parameters from a URL.
509
:param url: A relative or absolute URL
512
base_url, subsegments = split_segment_parameters_raw(url)
516
559
def join_segment_parameters_raw(base, *subsegments):
517
"""Create a new URL by adding subsegments to an existing one.
560
"""Create a new URL by adding subsegments to an existing one.
519
562
This adds the specified subsegments to the last path in the specified
520
563
base URL. The subsegments should be bytestrings.
546
589
new_parameters.update(existing_parameters)
547
590
for key, value in parameters.items():
548
591
if not isinstance(key, str):
549
raise TypeError("parameter key %r is not a str" % key)
592
raise TypeError("parameter key %r is not a bytestring" % key)
550
593
if not isinstance(value, str):
551
raise TypeError("parameter value %r for %r is not a str" %
594
raise TypeError("parameter value %r for %s is not a bytestring" %
554
597
raise InvalidURLJoin("= exists in parameter key", url,
556
599
new_parameters[key] = value
557
return join_segment_parameters_raw(
558
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
600
return join_segment_parameters_raw(base,
601
*["%s=%s" % item for item in sorted(new_parameters.items())])
561
604
def _win32_strip_local_trailing_slash(url):
619
662
# plain ASCII strings, or the final .decode will
620
663
# try to encode the UNICODE => ASCII, and then decode
623
if isinstance(url, str):
665
if isinstance(url, text_type):
667
url = url.encode("ascii")
626
668
except UnicodeError as e:
628
url, 'URL was not a plain ASCII url: %s' % (e,))
629
return urlparse.unquote(url)
669
raise InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
671
unquoted = urlparse.unquote_to_bytes(url)
673
unquoted = unquote(url)
675
unicode_path = unquoted.decode('utf-8')
676
except UnicodeError as e:
677
raise InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
632
681
# These are characters that if escaped, should stay that way
633
682
_no_decode_chars = ';/?:@&=+$,#'
634
683
_no_decode_ords = [ord(c) for c in _no_decode_chars]
635
684
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
636
+ ['%02X' % o for o in _no_decode_ords])
637
_hex_display_map = dict(([('%02x' % o, bytes([o])) for o in range(256)]
638
+ [('%02X' % o, bytes([o])) for o in range(256)]))
639
# These entries get mapped to themselves
640
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
641
for hex in _no_decode_hex)
685
+ ['%02X' % o for o in _no_decode_ords])
686
_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
687
+ [('%02X' % o, chr(o)) for o in range(256)]))
688
#These entries get mapped to themselves
689
_hex_display_map.update((hex, '%'+hex) for hex in _no_decode_hex)
643
691
# These characters shouldn't be percent-encoded, and it's always safe to
644
692
# unencode them if they are.
645
693
_url_dont_escape_characters = set(
646
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
647
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
648
"0123456789" # Numbers
649
"-._~" # Unreserved characters
694
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
695
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
696
"0123456789" # Numbers
697
"-._~" # Unreserved characters
652
700
# These characters should not be escaped
653
701
_url_safe_characters = set(
654
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
655
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
656
"0123456789" # Numbers
657
"_.-!~*'()" # Unreserved characters
658
"/;?:@&=+$," # Reserved characters
659
"%#" # Extra reserved characters
702
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
703
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
704
"0123456789" # Numbers
705
"_.-!~*'()" # Unreserved characters
706
"/;?:@&=+$," # Reserved characters
707
"%#" # Extra reserved characters
663
def _unescape_segment_for_display(segment, encoding):
664
"""Unescape a segment for display.
666
Helper for unescape_for_display
668
:param url: A 7-bit ASCII URL
669
:param encoding: The final output encoding
671
:return: A unicode string which can be safely encoded into the
674
escaped_chunks = segment.split('%')
675
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
676
for j in range(1, len(escaped_chunks)):
677
item = escaped_chunks[j]
679
escaped_chunks[j] = _hex_display_map[item[:2]]
681
# Put back the percent symbol
682
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8'))
683
except UnicodeDecodeError:
684
escaped_chunks[j] = chr(int(item[:2], 16)).encode('utf-8')
685
escaped_chunks[j] += (item[2:].encode('utf-8'))
686
unescaped = b''.join(escaped_chunks)
688
decoded = unescaped.decode('utf-8')
689
except UnicodeDecodeError:
690
# If this path segment cannot be properly utf-8 decoded
691
# after doing unescaping we will just leave it alone
695
decoded.encode(encoding)
696
except UnicodeEncodeError:
697
# If this chunk cannot be encoded in the local
698
# encoding, then we should leave it alone
701
# Otherwise take the url decoded one
705
710
def unescape_for_display(url, encoding):
706
711
"""Decode what you can for a URL, so that we get a nice looking path.
730
735
# Split into sections to try to decode utf-8
731
736
res = url.split('/')
732
737
for i in range(1, len(res)):
733
res[i] = _unescape_segment_for_display(res[i], encoding)
738
escaped_chunks = res[i].split('%')
739
for j in range(1, len(escaped_chunks)):
740
item = escaped_chunks[j]
742
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
744
# Put back the percent symbol
745
escaped_chunks[j] = '%' + item
746
except UnicodeDecodeError:
747
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
748
unescaped = ''.join(escaped_chunks)
750
decoded = unescaped.decode('utf-8')
751
except UnicodeDecodeError:
752
# If this path segment cannot be properly utf-8 decoded
753
# after doing unescaping we will just leave it alone
757
decoded.encode(encoding)
758
except UnicodeEncodeError:
759
# If this chunk cannot be encoded in the local
760
# encoding, then we should leave it alone
763
# Otherwise take the url decoded one
734
765
return u'/'.join(res)
913
930
:param relpath: relative url string for relative part of remote path.
914
931
:return: urlencoded string for final path.
916
# pad.lv/1696545: For the moment, accept both native strings and
918
if isinstance(relpath, str):
920
elif isinstance(relpath, str):
922
relpath = relpath.encode()
923
except UnicodeEncodeError:
924
raise InvalidURL(relpath)
933
if not isinstance(relpath, str):
926
934
raise InvalidURL(relpath)
927
935
relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath)
928
936
if relpath.startswith('/'):