96
88
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
99
# Private copies of quote and unquote, copied from Python's
100
# urllib module because urllib unconditionally imports socket, which imports
103
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
104
'abcdefghijklmnopqrstuvwxyz'
107
for i, c in zip(range(256), ''.join(map(chr, range(256)))):
108
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
112
def quote(s, safe='/'):
113
"""quote('abc def') -> 'abc%20def'
115
Each part of a URL, e.g. the path info, the query, etc., has a
116
different set of reserved characters that must be quoted.
118
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
119
the following reserved characters.
121
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
124
Each of these characters is reserved in some component of a URL,
125
but not necessarily in all of them.
127
By default, the quote function is intended for quoting the path
128
section of a URL. Thus, it will not encode '/'. This character
129
is reserved, but in typical usage the quote function is being
130
called on a path where the existing slash characters are used as
136
raise TypeError('None object cannot be quoted')
138
cachekey = (safe, always_safe)
140
(quoter, safe) = _safe_quoters[cachekey]
142
safe_map = _safe_map.copy()
143
safe_map.update([(c, c) for c in safe])
144
quoter = safe_map.__getitem__
145
safe = always_safe + safe
146
_safe_quoters[cachekey] = (quoter, safe)
147
if not s.rstrip(safe):
149
return ''.join(map(quoter, s))
91
quote_from_bytes = urlparse.quote_from_bytes
92
quote = urlparse.quote
93
unquote_to_bytes = urlparse.unquote_to_bytes
152
94
unquote = urlparse.unquote
97
def escape(relpath, safe='/~'):
156
98
"""Escape relpath to be a valid url."""
157
if not isinstance(relpath, str):
158
relpath = relpath.encode('utf-8')
159
return quote(relpath, safe='/~')
99
return quote(relpath, safe=safe)
162
102
def file_relpath(base, path):
298
238
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
299
239
if not url.startswith('file://'):
300
240
raise InvalidURL(url, 'local urls must start with file:///, '
301
'UNC path urls must start with file://')
302
url = split_segment_parameters_raw(url)[0]
241
'UNC path urls must start with file://')
242
url = strip_segment_parameters(url)
303
243
# We strip off all 3 slashes
304
244
win32_url = url[len('file:'):]
305
245
# check for UNC path: //HOST/path
306
246
if not win32_url.startswith('///'):
307
247
if (win32_url[2] == '/'
308
or win32_url[3] in '|:'):
248
or win32_url[3] in '|:'):
309
249
raise InvalidURL(url, 'Win32 UNC path urls'
310
' have form file://HOST/path')
250
' have form file://HOST/path')
311
251
return unescape(win32_url)
313
253
# allow empty paths so we can serve all roots
534
473
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
535
474
# operates on urls not url+segments, and Transport classes
536
# should not be blindly adding slashes in the first place.
475
# should not be blindly adding slashes in the first place.
537
476
lurl = strip_trailing_slash(url)
538
477
# Segments begin at first comma after last forward slash, if one exists
539
segment_start = lurl.find(",", lurl.rfind("/")+1)
478
segment_start = lurl.find(",", lurl.rfind("/") + 1)
540
479
if segment_start == -1:
542
return (lurl[:segment_start], lurl[segment_start+1:].split(","))
481
return (lurl[:segment_start],
482
[str(s) for s in lurl[segment_start + 1:].split(",")])
545
485
def split_segment_parameters(url):
551
491
(base_url, subsegments) = split_segment_parameters_raw(url)
553
493
for subsegment in subsegments:
554
(key, value) = subsegment.split("=", 1)
495
(key, value) = subsegment.split("=", 1)
497
raise InvalidURL(url, "missing = in subsegment")
498
if not isinstance(key, str):
500
if not isinstance(value, str):
501
raise TypeError(value)
555
502
parameters[key] = value
556
503
return (base_url, parameters)
506
def strip_segment_parameters(url):
507
"""Strip the segment parameters from a URL.
509
:param url: A relative or absolute URL
512
base_url, subsegments = split_segment_parameters_raw(url)
559
516
def join_segment_parameters_raw(base, *subsegments):
560
"""Create a new URL by adding subsegments to an existing one.
517
"""Create a new URL by adding subsegments to an existing one.
562
519
This adds the specified subsegments to the last path in the specified
563
520
base URL. The subsegments should be bytestrings.
589
546
new_parameters.update(existing_parameters)
590
547
for key, value in parameters.items():
591
548
if not isinstance(key, str):
592
raise TypeError("parameter key %r is not a bytestring" % key)
549
raise TypeError("parameter key %r is not a str" % key)
593
550
if not isinstance(value, str):
594
raise TypeError("parameter value %r for %s is not a bytestring" %
551
raise TypeError("parameter value %r for %r is not a str" %
597
554
raise InvalidURLJoin("= exists in parameter key", url,
599
556
new_parameters[key] = value
600
return join_segment_parameters_raw(base,
601
*["%s=%s" % item for item in sorted(new_parameters.items())])
557
return join_segment_parameters_raw(
558
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
604
561
def _win32_strip_local_trailing_slash(url):
662
619
# plain ASCII strings, or the final .decode will
663
620
# try to encode the UNICODE => ASCII, and then decode
665
if isinstance(url, text_type):
623
if isinstance(url, str):
667
url = url.encode("ascii")
668
626
except UnicodeError as e:
669
raise InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
671
unquoted = urlparse.unquote_to_bytes(url)
673
unquoted = unquote(url)
675
unicode_path = unquoted.decode('utf-8')
676
except UnicodeError as e:
677
raise InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
628
url, 'URL was not a plain ASCII url: %s' % (e,))
629
return urlparse.unquote(url)
681
632
# These are characters that if escaped, should stay that way
682
633
_no_decode_chars = ';/?:@&=+$,#'
683
634
_no_decode_ords = [ord(c) for c in _no_decode_chars]
684
635
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
685
+ ['%02X' % o for o in _no_decode_ords])
686
_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
687
+ [('%02X' % o, chr(o)) for o in range(256)]))
688
#These entries get mapped to themselves
689
_hex_display_map.update((hex, '%'+hex) for hex in _no_decode_hex)
636
+ ['%02X' % o for o in _no_decode_ords])
637
_hex_display_map = dict(([('%02x' % o, bytes([o])) for o in range(256)]
638
+ [('%02X' % o, bytes([o])) for o in range(256)]))
639
# These entries get mapped to themselves
640
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
641
for hex in _no_decode_hex)
691
643
# These characters shouldn't be percent-encoded, and it's always safe to
692
644
# unencode them if they are.
693
645
_url_dont_escape_characters = set(
694
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
695
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
696
"0123456789" # Numbers
697
"-._~" # Unreserved characters
646
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
647
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
648
"0123456789" # Numbers
649
"-._~" # Unreserved characters
700
652
# These characters should not be escaped
701
653
_url_safe_characters = set(
702
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
703
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
704
"0123456789" # Numbers
705
"_.-!~*'()" # Unreserved characters
706
"/;?:@&=+$," # Reserved characters
707
"%#" # Extra reserved characters
654
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
655
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
656
"0123456789" # Numbers
657
"_.-!~*'()" # Unreserved characters
658
"/;?:@&=+$," # Reserved characters
659
"%#" # Extra reserved characters
663
def _unescape_segment_for_display(segment, encoding):
664
"""Unescape a segment for display.
666
Helper for unescape_for_display
668
:param url: A 7-bit ASCII URL
669
:param encoding: The final output encoding
671
:return: A unicode string which can be safely encoded into the
674
escaped_chunks = segment.split('%')
675
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
676
for j in range(1, len(escaped_chunks)):
677
item = escaped_chunks[j]
679
escaped_chunks[j] = _hex_display_map[item[:2]]
681
# Put back the percent symbol
682
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8'))
683
except UnicodeDecodeError:
684
escaped_chunks[j] = chr(int(item[:2], 16)).encode('utf-8')
685
escaped_chunks[j] += (item[2:].encode('utf-8'))
686
unescaped = b''.join(escaped_chunks)
688
decoded = unescaped.decode('utf-8')
689
except UnicodeDecodeError:
690
# If this path segment cannot be properly utf-8 decoded
691
# after doing unescaping we will just leave it alone
695
decoded.encode(encoding)
696
except UnicodeEncodeError:
697
# If this chunk cannot be encoded in the local
698
# encoding, then we should leave it alone
701
# Otherwise take the url decoded one
710
705
def unescape_for_display(url, encoding):
711
706
"""Decode what you can for a URL, so that we get a nice looking path.
735
730
# Split into sections to try to decode utf-8
736
731
res = url.split('/')
737
732
for i in range(1, len(res)):
738
escaped_chunks = res[i].split('%')
739
for j in range(1, len(escaped_chunks)):
740
item = escaped_chunks[j]
742
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
744
# Put back the percent symbol
745
escaped_chunks[j] = '%' + item
746
except UnicodeDecodeError:
747
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
748
unescaped = ''.join(escaped_chunks)
750
decoded = unescaped.decode('utf-8')
751
except UnicodeDecodeError:
752
# If this path segment cannot be properly utf-8 decoded
753
# after doing unescaping we will just leave it alone
757
decoded.encode(encoding)
758
except UnicodeEncodeError:
759
# If this chunk cannot be encoded in the local
760
# encoding, then we should leave it alone
763
# Otherwise take the url decoded one
733
res[i] = _unescape_segment_for_display(res[i], encoding)
765
734
return u'/'.join(res)
930
913
:param relpath: relative url string for relative part of remote path.
931
914
:return: urlencoded string for final path.
933
if not isinstance(relpath, str):
916
# pad.lv/1696545: For the moment, accept both native strings and
918
if isinstance(relpath, str):
920
elif isinstance(relpath, str):
922
relpath = relpath.encode()
923
except UnicodeEncodeError:
924
raise InvalidURL(relpath)
934
926
raise InvalidURL(relpath)
935
927
relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath)
936
928
if relpath.startswith('/'):