88
96
return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
91
quote_from_bytes = urlparse.quote_from_bytes
92
quote = urlparse.quote
93
unquote_to_bytes = urlparse.unquote_to_bytes
99
# Private copies of quote and unquote, copied from Python's
100
# urllib module because urllib unconditionally imports socket, which imports
103
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
104
'abcdefghijklmnopqrstuvwxyz'
107
for i, c in zip(range(256), ''.join(map(chr, range(256)))):
108
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
112
def quote(s, safe='/'):
113
"""quote('abc def') -> 'abc%20def'
115
Each part of a URL, e.g. the path info, the query, etc., has a
116
different set of reserved characters that must be quoted.
118
RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
119
the following reserved characters.
121
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
124
Each of these characters is reserved in some component of a URL,
125
but not necessarily in all of them.
127
By default, the quote function is intended for quoting the path
128
section of a URL. Thus, it will not encode '/'. This character
129
is reserved, but in typical usage the quote function is being
130
called on a path where the existing slash characters are used as
136
raise TypeError('None object cannot be quoted')
138
cachekey = (safe, always_safe)
140
(quoter, safe) = _safe_quoters[cachekey]
142
safe_map = _safe_map.copy()
143
safe_map.update([(c, c) for c in safe])
144
quoter = safe_map.__getitem__
145
safe = always_safe + safe
146
_safe_quoters[cachekey] = (quoter, safe)
147
if not s.rstrip(safe):
149
return ''.join(map(quoter, s))
94
152
unquote = urlparse.unquote
97
def escape(relpath, safe='/~'):
98
156
"""Escape relpath to be a valid url."""
99
return quote(relpath, safe=safe)
157
if not isinstance(relpath, str):
158
relpath = relpath.encode('utf-8')
159
return quote(relpath, safe='/~')
102
162
def file_relpath(base, path):
238
298
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
239
299
if not url.startswith('file://'):
240
300
raise InvalidURL(url, 'local urls must start with file:///, '
241
'UNC path urls must start with file://')
242
url = strip_segment_parameters(url)
301
'UNC path urls must start with file://')
302
url = split_segment_parameters_raw(url)[0]
243
303
# Strip only the 'file:' scheme prefix; the leading slashes are kept for the UNC check below
244
304
win32_url = url[len('file:'):]
245
305
# check for UNC path: //HOST/path
246
306
if not win32_url.startswith('///'):
247
307
if (win32_url[2] == '/'
248
or win32_url[3] in '|:'):
308
or win32_url[3] in '|:'):
249
309
raise InvalidURL(url, 'Win32 UNC path urls'
250
' have form file://HOST/path')
310
' have form file://HOST/path')
251
311
return unescape(win32_url)
253
313
# allow empty paths so we can serve all roots
473
534
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
474
535
# operates on urls not url+segments, and Transport classes
475
# should not be blindly adding slashes in the first place.
536
# should not be blindly adding slashes in the first place.
476
537
lurl = strip_trailing_slash(url)
477
538
# Segments begin at first comma after last forward slash, if one exists
478
segment_start = lurl.find(",", lurl.rfind("/") + 1)
539
segment_start = lurl.find(",", lurl.rfind("/")+1)
479
540
if segment_start == -1:
481
return (lurl[:segment_start],
482
[str(s) for s in lurl[segment_start + 1:].split(",")])
542
return (lurl[:segment_start], [str(s) for s in lurl[segment_start+1:].split(",")])
485
545
def split_segment_parameters(url):
546
593
new_parameters.update(existing_parameters)
547
594
for key, value in parameters.items():
548
595
if not isinstance(key, str):
549
raise TypeError("parameter key %r is not a str" % key)
596
raise TypeError("parameter key %r is not a bytestring" % key)
550
597
if not isinstance(value, str):
551
raise TypeError("parameter value %r for %r is not a str" %
598
raise TypeError("parameter value %r for %s is not a bytestring" %
554
601
raise InvalidURLJoin("= exists in parameter key", url,
556
603
new_parameters[key] = value
557
return join_segment_parameters_raw(
558
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
604
return join_segment_parameters_raw(base,
605
*["%s=%s" % item for item in sorted(new_parameters.items())])
561
608
def _win32_strip_local_trailing_slash(url):
619
666
# plain ASCII strings, or the final .decode will
620
667
# try to encode the UNICODE => ASCII, and then decode
623
if isinstance(url, str):
669
if isinstance(url, text_type):
671
url = url.encode("ascii")
626
672
except UnicodeError as e:
628
url, 'URL was not a plain ASCII url: %s' % (e,))
629
return urlparse.unquote(url)
673
raise InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
675
unquoted = urlparse.unquote_to_bytes(url)
677
unquoted = unquote(url)
679
unicode_path = unquoted.decode('utf-8')
680
except UnicodeError as e:
681
raise InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
632
685
# These are characters that if escaped, should stay that way
633
686
_no_decode_chars = ';/?:@&=+$,#'
634
687
_no_decode_ords = [ord(c) for c in _no_decode_chars]
635
688
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
636
+ ['%02X' % o for o in _no_decode_ords])
637
_hex_display_map = dict(([('%02x' % o, bytes([o])) for o in range(256)]
638
+ [('%02X' % o, bytes([o])) for o in range(256)]))
639
# These entries get mapped to themselves
640
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
641
for hex in _no_decode_hex)
689
+ ['%02X' % o for o in _no_decode_ords])
690
_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
691
+ [('%02X' % o, chr(o)) for o in range(256)]))
692
# These entries stay percent-escaped: each maps back to '%' plus its own hex digits
693
_hex_display_map.update((hex, '%'+hex) for hex in _no_decode_hex)
643
695
# These characters shouldn't be percent-encoded, and it's always safe to
644
696
# unencode them if they are.
645
697
_url_dont_escape_characters = set(
646
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
647
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
648
"0123456789" # Numbers
649
"-._~" # Unreserved characters
698
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
699
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
700
"0123456789" # Numbers
701
"-._~" # Unreserved characters
652
704
# These characters should not be escaped
653
705
_url_safe_characters = set(
654
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
655
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
656
"0123456789" # Numbers
657
"_.-!~*'()" # Unreserved characters
658
"/;?:@&=+$," # Reserved characters
659
"%#" # Extra reserved characters
706
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
707
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
708
"0123456789" # Numbers
709
"_.-!~*'()" # Unreserved characters
710
"/;?:@&=+$," # Reserved characters
711
"%#" # Extra reserved characters
663
def _unescape_segment_for_display(segment, encoding):
664
"""Unescape a segment for display.
666
Helper for unescape_for_display
668
:param segment: A 7-bit ASCII URL segment (one piece of a URL split on '/')
669
:param encoding: The final output encoding
671
:return: A unicode string which can be safely encoded into the
674
escaped_chunks = segment.split('%')
675
escaped_chunks[0] = escaped_chunks[0].encode('utf-8')
676
for j in range(1, len(escaped_chunks)):
677
item = escaped_chunks[j]
679
escaped_chunks[j] = _hex_display_map[item[:2]]
681
# Put back the percent symbol
682
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8'))
683
except UnicodeDecodeError:
684
escaped_chunks[j] = chr(int(item[:2], 16)).encode('utf-8')
685
escaped_chunks[j] += (item[2:].encode('utf-8'))
686
unescaped = b''.join(escaped_chunks)
688
decoded = unescaped.decode('utf-8')
689
except UnicodeDecodeError:
690
# If this path segment cannot be properly utf-8 decoded
691
# after doing unescaping we will just leave it alone
695
decoded.encode(encoding)
696
except UnicodeEncodeError:
697
# If this chunk cannot be encoded in the local
698
# encoding, then we should leave it alone
701
# Otherwise take the url decoded one
705
714
def unescape_for_display(url, encoding):
706
715
"""Decode what you can for a URL, so that we get a nice looking path.
730
739
# Split into sections to try to decode utf-8
731
740
res = url.split('/')
732
741
for i in range(1, len(res)):
733
res[i] = _unescape_segment_for_display(res[i], encoding)
742
escaped_chunks = res[i].split('%')
743
for j in range(1, len(escaped_chunks)):
744
item = escaped_chunks[j]
746
escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
748
# Put back the percent symbol
749
escaped_chunks[j] = '%' + item
750
except UnicodeDecodeError:
751
escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
752
unescaped = ''.join(escaped_chunks)
753
if sys.version_info[0] == 2:
755
decoded = unescaped.decode('utf-8')
756
except UnicodeDecodeError:
757
# If this path segment cannot be properly utf-8 decoded
758
# after doing unescaping we will just leave it alone
762
decoded.encode(encoding)
763
except UnicodeEncodeError:
764
# If this chunk cannot be encoded in the local
765
# encoding, then we should leave it alone
768
# Otherwise take the url decoded one
734
772
return u'/'.join(res)