99
96
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
103
quote_from_bytes = urlparse.quote_from_bytes
104
quote = urlparse.quote
105
unquote_to_bytes = urlparse.unquote_to_bytes
107
# Private copies of quote and unquote, copied from Python's urllib module
108
# because urllib unconditionally imports socket, which imports ssl.
110
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
111
'abcdefghijklmnopqrstuvwxyz'
114
for i, c in zip(range(256), ''.join(map(chr, range(256)))):
115
_safe_map[c] = c if (
116
i < 128 and c in always_safe) else '%{0:02X}'.format(i)
119
def quote_from_bytes(s, safe='/'):
120
"""quote('abc def') -> 'abc%20def'
122
Each part of a URL, e.g. the path info, the query, etc., has a
123
different set of reserved characters that must be quoted.
125
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
126
the following reserved characters.
128
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
131
Each of these characters is reserved in some component of a URL,
132
but not necessarily in all of them.
134
By default, the quote function is intended for quoting the path
135
section of a URL. Thus, it will not encode '/'. This character
136
is reserved, but in typical usage the quote function is being
137
called on a path where the existing slash characters are used as
143
raise TypeError('None object cannot be quoted')
145
cachekey = (safe, always_safe)
147
(quoter, safe) = _safe_quoters[cachekey]
149
safe_map = _safe_map.copy()
150
safe_map.update([(c, c) for c in safe])
151
quoter = safe_map.__getitem__
152
safe = always_safe + safe
153
_safe_quoters[cachekey] = (quoter, safe)
154
if not s.rstrip(safe):
156
return ''.join(map(quoter, s))
158
quote = quote_from_bytes
159
unquote_to_bytes = urlparse.unquote
99
# Private copies of quote and unquote, copied from Python's
100
# urllib module because urllib unconditionally imports socket, which imports
103
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
104
'abcdefghijklmnopqrstuvwxyz'
107
for i, c in zip(range(256), ''.join(map(chr, range(256)))):
108
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
112
def quote(s, safe='/'):
113
"""quote('abc def') -> 'abc%20def'
115
Each part of a URL, e.g. the path info, the query, etc., has a
116
different set of reserved characters that must be quoted.
118
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
119
the following reserved characters.
121
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
124
Each of these characters is reserved in some component of a URL,
125
but not necessarily in all of them.
127
By default, the quote function is intended for quoting the path
128
section of a URL. Thus, it will not encode '/'. This character
129
is reserved, but in typical usage the quote function is being
130
called on a path where the existing slash characters are used as
136
raise TypeError('None object cannot be quoted')
138
cachekey = (safe, always_safe)
140
(quoter, safe) = _safe_quoters[cachekey]
142
safe_map = _safe_map.copy()
143
safe_map.update([(c, c) for c in safe])
144
quoter = safe_map.__getitem__
145
safe = always_safe + safe
146
_safe_quoters[cachekey] = (quoter, safe)
147
if not s.rstrip(safe):
149
return ''.join(map(quoter, s))
162
152
unquote = urlparse.unquote
165
def escape(relpath, safe='/~'):
166
156
"""Escape relpath to be a valid url."""
167
if not isinstance(relpath, str) and sys.version_info[0] == 2:
168
# GZ 2019-06-16: Should use _fs_enc instead here really?
157
if not isinstance(relpath, str):
169
158
relpath = relpath.encode('utf-8')
170
return quote(relpath, safe=safe)
159
return quote(relpath, safe='/~')
173
162
def file_relpath(base, path):
309
298
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
310
299
if not url.startswith('file://'):
311
300
raise InvalidURL(url, 'local urls must start with file:///, '
312
'UNC path urls must start with file://')
313
url = strip_segment_parameters(url)
301
'UNC path urls must start with file://')
302
url = split_segment_parameters_raw(url)[0]
314
303
# We strip off all 3 slashes
315
304
win32_url = url[len('file:'):]
316
305
# check for UNC path: //HOST/path
317
306
if not win32_url.startswith('///'):
318
307
if (win32_url[2] == '/'
319
or win32_url[3] in '|:'):
308
or win32_url[3] in '|:'):
320
309
raise InvalidURL(url, 'Win32 UNC path urls'
321
' have form file://HOST/path')
310
' have form file://HOST/path')
322
311
return unescape(win32_url)
324
313
# allow empty paths so we can serve all roots
544
534
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
545
535
# operates on urls not url+segments, and Transport classes
546
# should not be blindly adding slashes in the first place.
536
# should not be blindly adding slashes in the first place.
547
537
lurl = strip_trailing_slash(url)
548
538
# Segments begin at first comma after last forward slash, if one exists
549
segment_start = lurl.find(",", lurl.rfind("/") + 1)
539
segment_start = lurl.find(",", lurl.rfind("/")+1)
550
540
if segment_start == -1:
552
return (lurl[:segment_start],
553
[str(s) for s in lurl[segment_start + 1:].split(",")])
542
return (lurl[:segment_start], lurl[segment_start+1:].split(","))
556
545
def split_segment_parameters(url):
562
551
(base_url, subsegments) = split_segment_parameters_raw(url)
564
553
for subsegment in subsegments:
566
(key, value) = subsegment.split("=", 1)
568
raise InvalidURL(url, "missing = in subsegment")
569
if not isinstance(key, str):
571
if not isinstance(value, str):
572
raise TypeError(value)
554
(key, value) = subsegment.split("=", 1)
573
555
parameters[key] = value
574
556
return (base_url, parameters)
577
def strip_segment_parameters(url):
578
"""Strip the segment parameters from a URL.
580
:param url: A relative or absolute URL
583
base_url, subsegments = split_segment_parameters_raw(url)
587
559
def join_segment_parameters_raw(base, *subsegments):
588
"""Create a new URL by adding subsegments to an existing one.
560
"""Create a new URL by adding subsegments to an existing one.
590
562
This adds the specified subsegments to the last path in the specified
591
563
base URL. The subsegments should be bytestrings.
617
589
new_parameters.update(existing_parameters)
618
590
for key, value in parameters.items():
619
591
if not isinstance(key, str):
620
raise TypeError("parameter key %r is not a str" % key)
592
raise TypeError("parameter key %r is not a bytestring" % key)
621
593
if not isinstance(value, str):
622
raise TypeError("parameter value %r for %r is not a str" %
594
raise TypeError("parameter value %r for %s is not a bytestring" %
625
597
raise InvalidURLJoin("= exists in parameter key", url,
627
599
new_parameters[key] = value
628
return join_segment_parameters_raw(
629
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
600
return join_segment_parameters_raw(base,
601
*["%s=%s" % item for item in sorted(new_parameters.items())])
632
604
def _win32_strip_local_trailing_slash(url):
690
662
# plain ASCII strings, or the final .decode will
691
663
# try to encode the UNICODE => ASCII, and then decode
665
if isinstance(url, text_type):
667
url = url.encode("ascii")
668
except UnicodeError as e:
669
raise InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
695
if isinstance(url, text_type):
698
except UnicodeError as e:
700
url, 'URL was not a plain ASCII url: %s' % (e,))
701
return urlparse.unquote(url)
671
unquoted = urlparse.unquote_to_bytes(url)
703
if isinstance(url, text_type):
705
url = url.encode("ascii")
706
except UnicodeError as e:
708
url, 'URL was not a plain ASCII url: %s' % (e,))
709
673
unquoted = unquote(url)
711
unicode_path = unquoted.decode('utf-8')
712
except UnicodeError as e:
714
url, 'Unable to encode the URL as utf-8: %s' % (e,))
675
unicode_path = unquoted.decode('utf-8')
676
except UnicodeError as e:
677
raise InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
718
681
# These are characters that if escaped, should stay that way
719
682
_no_decode_chars = ';/?:@&=+$,#'
720
683
_no_decode_ords = [ord(c) for c in _no_decode_chars]
721
684
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
722
+ ['%02X' % o for o in _no_decode_ords])
723
_hex_display_map = dict(([('%02x' % o, int2byte(o)) for o in range(256)]
724
+ [('%02X' % o, int2byte(o)) for o in range(256)]))
725
# These entries get mapped to themselves
726
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
727
for hex in _no_decode_hex)
685
+ ['%02X' % o for o in _no_decode_ords])
686
_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
687
+ [('%02X' % o, chr(o)) for o in range(256)]))
688
#These entries get mapped to themselves
689
_hex_display_map.update((hex, '%'+hex) for hex in _no_decode_hex)
729
691
# These characters shouldn't be percent-encoded, and it's always safe to
730
692
# unencode them if they are.
731
693
_url_dont_escape_characters = set(
732
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
733
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
734
"0123456789" # Numbers
735
"-._~" # Unreserved characters
694
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
695
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
696
"0123456789" # Numbers
697
"-._~" # Unreserved characters
738
700
# These characters should not be escaped
739
701
_url_safe_characters = set(
740
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
741
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
742
"0123456789" # Numbers
743
"_.-!~*'()" # Unreserved characters
744
"/;?:@&=+$," # Reserved characters
745
"%#" # Extra reserved characters
702
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
703
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
704
"0123456789" # Numbers
705
"_.-!~*'()" # Unreserved characters
706
"/;?:@&=+$," # Reserved characters
707
"%#" # Extra reserved characters
749
def _unescape_segment_for_display(segment, encoding):
750
"""Unescape a segment for display.
752
Helper for unescape_for_display
754
:param url: A 7-bit ASCII URL
755
:param encoding: The final output encoding
757
:return: A unicode string which can be safely encoded into the
760
escaped_chunks = segment.split('%')
761
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
762
for j in range(1, len(escaped_chunks)):
763
item = escaped_chunks[j]
765
escaped_chunks[j] = _hex_display_map[item[:2]]
767
# Put back the percent symbol
768
escaped_chunks[j] = b'%' + \
769
(item[:2].encode('utf-8') if PY3 else item[:2])
770
except UnicodeDecodeError:
771
escaped_chunks[j] = unichr(int(item[:2], 16)).encode('utf-8')
772
escaped_chunks[j] += (item[2:].encode('utf-8') if PY3 else item[2:])
773
unescaped = b''.join(escaped_chunks)
775
decoded = unescaped.decode('utf-8')
776
except UnicodeDecodeError:
777
# If this path segment cannot be properly utf-8 decoded
778
# after doing unescaping we will just leave it alone
782
decoded.encode(encoding)
783
except UnicodeEncodeError:
784
# If this chunk cannot be encoded in the local
785
# encoding, then we should leave it alone
788
# Otherwise take the url decoded one
792
710
def unescape_for_display(url, encoding):
793
711
"""Decode what you can for a URL, so that we get a nice looking path.
817
735
# Split into sections to try to decode utf-8
818
736
res = url.split('/')
819
737
for i in range(1, len(res)):
820
res[i] = _unescape_segment_for_display(res[i], encoding)
738
escaped_chunks = res[i].split('%')
739
for j in range(1, len(escaped_chunks)):
740
item = escaped_chunks[j]
742
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
744
# Put back the percent symbol
745
escaped_chunks[j] = '%' + item
746
except UnicodeDecodeError:
747
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
748
unescaped = ''.join(escaped_chunks)
750
decoded = unescaped.decode('utf-8')
751
except UnicodeDecodeError:
752
# If this path segment cannot be properly utf-8 decoded
753
# after doing unescaping we will just leave it alone
757
decoded.encode(encoding)
758
except UnicodeEncodeError:
759
# If this chunk cannot be encoded in the local
760
# encoding, then we should leave it alone
763
# Otherwise take the url decoded one
821
765
return u'/'.join(res)