67
68
def __init__(self, from_, to):
70
errors.PathError.__init__(self, from_, 'URLs differ by more than path.')
71
errors.PathError.__init__(
72
self, from_, 'URLs differ by more than path.')
73
75
def basename(url, exclude_trailing_slash=True):
102
104
quote = urlparse.quote
103
105
unquote_to_bytes = urlparse.unquote_to_bytes
105
# Private copies of quote and unquote, copied from Python's
106
# urllib module because urllib unconditionally imports socket, which imports
107
# Private copies of quote and unquote, copied from Python's urllib module
108
# because urllib unconditionally imports socket, which imports ssl.
109
110
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
110
111
'abcdefghijklmnopqrstuvwxyz'
111
112
'0123456789' '_.-')
113
114
for i, c in zip(range(256), ''.join(map(chr, range(256)))):
114
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
115
_safe_map[c] = c if (
116
i < 128 and c in always_safe) else '%{0:02X}'.format(i)
115
117
_safe_quoters = {}
117
119
def quote_from_bytes(s, safe='/'):
160
162
unquote = urlparse.unquote
165
def escape(relpath, safe='/~'):
    """Escape relpath to be a valid url.

    :param relpath: The relative path to escape. On Python 2, a value that
        is not a native ``str`` is encoded to UTF-8 before quoting.
    :param safe: Characters that are left unquoted. Defaults to ``'/~'``.
    :return: The percent-quoted form of ``relpath``, as produced by
        ``quote``.
    """
    if not isinstance(relpath, str) and sys.version_info[0] == 2:
        # On Python 2, anything that is not a native str is encoded to
        # UTF-8 before being handed to quote().
        relpath = relpath.encode('utf-8')
    # Honour the caller-supplied ``safe`` set instead of a hard-coded '/~',
    # which silently ignored the parameter.
    return quote(relpath, safe=safe)
170
172
def file_relpath(base, path):
175
177
if len(base) < MIN_ABS_FILEURL_LENGTH:
176
178
raise ValueError('Length of base (%r) must equal or'
177
' exceed the platform minimum url length (which is %d)' %
178
(base, MIN_ABS_FILEURL_LENGTH))
179
' exceed the platform minimum url length (which is %d)' %
180
(base, MIN_ABS_FILEURL_LENGTH))
179
181
base = osutils.normpath(local_path_from_url(base))
180
182
path = osutils.normpath(local_path_from_url(path))
181
183
return escape(osutils.relpath(base, path))
199
201
first_path_slash = path.find('/')
200
202
if first_path_slash == -1:
201
203
return len(scheme), None
202
return len(scheme), first_path_slash+m.start('path')
204
return len(scheme), first_path_slash + m.start('path')
306
308
"""Convert a url like file:///C:/path/to/foo into C:/path/to/foo"""
307
309
if not url.startswith('file://'):
308
310
raise InvalidURL(url, 'local urls must start with file:///, '
309
'UNC path urls must start with file://')
311
'UNC path urls must start with file://')
310
312
url = split_segment_parameters_raw(url)[0]
311
313
# We strip off all 3 slashes
312
314
win32_url = url[len('file:'):]
313
315
# check for UNC path: //HOST/path
314
316
if not win32_url.startswith('///'):
315
317
if (win32_url[2] == '/'
316
or win32_url[3] in '|:'):
318
or win32_url[3] in '|:'):
317
319
raise InvalidURL(url, 'Win32 UNC path urls'
318
' have form file://HOST/path')
320
' have form file://HOST/path')
319
321
return unescape(win32_url)
321
323
# allow empty paths so we can serve all roots
325
327
# usual local path with drive letter
326
328
if (len(win32_url) < 6
327
329
or win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz'
328
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
329
or win32_url[4] not in '|:'
330
or win32_url[5] != '/'):
330
'ABCDEFGHIJKLMNOPQRSTUVWXYZ') or
331
win32_url[4] not in '|:'
332
or win32_url[5] != '/'):
331
333
raise InvalidURL(url, 'Win32 file urls start with'
332
' file:///x:/, where x is a valid drive letter')
334
' file:///x:/, where x is a valid drive letter')
333
335
return win32_url[3].upper() + u':' + unescape(win32_url[5:])
413
415
if c not in _url_safe_characters:
414
416
raise InvalidURL(url, 'URLs can only contain specific'
415
' safe characters (not %r)' % c)
417
' safe characters (not %r)' % c)
416
418
path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
417
419
return str(prefix + ''.join(path))
422
424
for i in range(len(path_chars)):
423
425
if path_chars[i] not in _url_safe_characters:
424
chars = path_chars[i].encode('utf-8')
425
426
path_chars[i] = ''.join(
426
427
['%%%02X' % c for c in bytearray(path_chars[i].encode('utf-8'))])
427
428
path = ''.join(path_chars)
449
450
if base_scheme != other_scheme:
451
452
elif sys.platform == 'win32' and base_scheme == 'file://':
452
base_drive = base[base_first_slash+1:base_first_slash+3]
453
other_drive = other[other_first_slash+1:other_first_slash+3]
453
base_drive = base[base_first_slash + 1:base_first_slash + 3]
454
other_drive = other[other_first_slash + 1:other_first_slash + 3]
454
455
if base_drive != other_drive:
457
base_path = base[base_first_slash+1:]
458
other_path = other[other_first_slash+1:]
458
base_path = base[base_first_slash + 1:]
459
other_path = other[other_first_slash + 1:]
460
461
if base_path.endswith('/'):
461
462
base_path = base_path[:-1]
487
488
# path is currently /C:/foo
488
489
if len(path) < 4 or path[2] not in ':|' or path[3] != '/':
489
490
raise InvalidURL(url_base + path,
490
'win32 file:/// paths need a drive letter')
491
url_base += path[0:3] # file:// + /C:
492
path = path[3:] # /foo
491
'win32 file:/// paths need a drive letter')
492
url_base += path[0:3] # file:// + /C:
493
path = path[3:] # /foo
493
494
return url_base, path
500
501
:param exclude_trailing_slash: Strip off a final '/' if it is part
501
502
of the path (but not if it is part of the protocol specification)
503
:return: (parent_url, child_dir). child_dir may be the empty string if we're at
504
:return: (parent_url, child_dir). child_dir may be the empty string if
506
507
scheme_loc, first_path_slash = _find_scheme_and_separator(url)
519
520
# We have a fully defined path
520
url_base = url[:first_path_slash] # http://host, file://
521
path = url[first_path_slash:] # /file/foo
521
url_base = url[:first_path_slash] # http://host, file://
522
path = url[first_path_slash:] # /file/foo
523
524
if sys.platform == 'win32' and url.startswith('file:///'):
524
525
# Strip off the drive letter
542
543
# GZ 2011-11-18: Dodgy removing the terminal slash like this, function
543
544
# operates on urls not url+segments, and Transport classes
544
# should not be blindly adding slashes in the first place.
545
# should not be blindly adding slashes in the first place.
545
546
lurl = strip_trailing_slash(url)
546
547
# Segments begin at first comma after last forward slash, if one exists
547
segment_start = lurl.find(",", lurl.rfind("/")+1)
548
segment_start = lurl.find(",", lurl.rfind("/") + 1)
548
549
if segment_start == -1:
550
return (lurl[:segment_start], [str(s) for s in lurl[segment_start+1:].split(",")])
551
return (lurl[:segment_start],
552
[str(s) for s in lurl[segment_start + 1:].split(",")])
553
555
def split_segment_parameters(url):
559
561
(base_url, subsegments) = split_segment_parameters_raw(url)
561
563
for subsegment in subsegments:
562
(key, value) = subsegment.split("=", 1)
565
(key, value) = subsegment.split("=", 1)
567
raise InvalidURL(url, "missing = in subsegment")
563
568
if not isinstance(key, str):
564
569
raise TypeError(key)
565
570
if not isinstance(value, str):
571
576
def join_segment_parameters_raw(base, *subsegments):
572
"""Create a new URL by adding subsegments to an existing one.
577
"""Create a new URL by adding subsegments to an existing one.
574
579
This adds the specified subsegments to the last path in the specified
575
580
base URL. The subsegments should be bytestrings.
583
588
raise TypeError("Subsegment %r is not a bytestring" % subsegment)
584
589
if "," in subsegment:
585
590
raise InvalidURLJoin(", exists in subsegments",
587
592
return ",".join((base,) + subsegments)
604
609
raise TypeError("parameter key %r is not a str" % key)
605
610
if not isinstance(value, str):
606
611
raise TypeError("parameter value %r for %r is not a str" %
609
614
raise InvalidURLJoin("= exists in parameter key", url,
611
616
new_parameters[key] = value
612
return join_segment_parameters_raw(base,
613
*["%s=%s" % item for item in sorted(new_parameters.items())])
617
return join_segment_parameters_raw(
618
base, *["%s=%s" % item for item in sorted(new_parameters.items())])
616
621
def _win32_strip_local_trailing_slash(url):
681
686
url.encode("ascii")
682
687
except UnicodeError as e:
683
raise InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
689
url, 'URL was not a plain ASCII url: %s' % (e,))
684
690
return urlparse.unquote(url)
686
692
if isinstance(url, text_type):
688
694
url = url.encode("ascii")
689
695
except UnicodeError as e:
690
raise InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
697
url, 'URL was not a plain ASCII url: %s' % (e,))
691
698
unquoted = unquote(url)
693
700
unicode_path = unquoted.decode('utf-8')
694
701
except UnicodeError as e:
695
raise InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
703
url, 'Unable to encode the URL as utf-8: %s' % (e,))
696
704
return unicode_path
700
708
_no_decode_chars = ';/?:@&=+$,#'
701
709
_no_decode_ords = [ord(c) for c in _no_decode_chars]
702
710
_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
703
+ ['%02X' % o for o in _no_decode_ords])
711
+ ['%02X' % o for o in _no_decode_ords])
704
712
_hex_display_map = dict(([('%02x' % o, int2byte(o)) for o in range(256)]
705
+ [('%02X' % o, int2byte(o)) for o in range(256)]))
706
#These entries get mapped to themselves
707
_hex_display_map.update((hex, b'%'+hex.encode('ascii')) for hex in _no_decode_hex)
713
+ [('%02X' % o, int2byte(o)) for o in range(256)]))
714
# These entries get mapped to themselves
715
_hex_display_map.update((hex, b'%' + hex.encode('ascii'))
716
for hex in _no_decode_hex)
709
718
# These characters shouldn't be percent-encoded, and it's always safe to
710
719
# unencode them if they are.
711
720
_url_dont_escape_characters = set(
712
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
713
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
714
"0123456789" # Numbers
715
"-._~" # Unreserved characters
721
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
722
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
723
"0123456789" # Numbers
724
"-._~" # Unreserved characters
718
727
# These characters should not be escaped
719
728
_url_safe_characters = set(
720
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
721
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
722
"0123456789" # Numbers
723
"_.-!~*'()" # Unreserved characters
724
"/;?:@&=+$," # Reserved characters
725
"%#" # Extra reserved characters
729
"abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
730
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
731
"0123456789" # Numbers
732
"_.-!~*'()" # Unreserved characters
733
"/;?:@&=+$," # Reserved characters
734
"%#" # Extra reserved characters
745
754
escaped_chunks[j] = _hex_display_map[item[:2]]
747
756
# Put back the percent symbol
748
escaped_chunks[j] = b'%' + (item[:2].encode('utf-8') if PY3 else item[:2])
757
escaped_chunks[j] = b'%' + \
758
(item[:2].encode('utf-8') if PY3 else item[:2])
749
759
except UnicodeDecodeError:
750
760
escaped_chunks[j] = unichr(int(item[:2], 16)).encode('utf-8')
751
escaped_chunks[j] += (item[2:].encode('utf-8') if PY3 else item[2:])
761
escaped_chunks[j] += (item[2:].encode('utf-8') if PY3 else item[2:])
752
762
unescaped = b''.join(escaped_chunks)
754
764
decoded = unescaped.decode('utf-8')
851
861
to_segments = osutils.splitpath(to_path)
853
863
for count, (from_element, to_element) in enumerate(zip(from_segments,
855
865
if from_element != to_element:
868
878
"""Parsed URL."""
870
880
def __init__(self, scheme, quoted_user, quoted_password, quoted_host,
872
882
self.scheme = scheme
873
883
self.quoted_host = quoted_host
874
884
self.host = unquote(self.quoted_host)
884
894
self.password = None
886
self.quoted_path = _url_hex_escapes_re.sub(_unescape_safe_chars, quoted_path)
896
self.quoted_path = _url_hex_escapes_re.sub(
897
_unescape_safe_chars, quoted_path)
887
898
self.path = unquote(self.quoted_path)
889
900
def __eq__(self, other):
907
918
:param url: URL as bytestring
909
920
# GZ 2017-06-09: Actually validate ascii-ness
910
# pad.lv/1696545: For the moment, accept both native strings and unicode.
921
# pad.lv/1696545: For the moment, accept both native strings and
911
923
if isinstance(url, str):
913
925
elif isinstance(url, text_type):
941
if host != "" and host[0] == '[' and host[-1] == ']': #IPv6
953
if host != "" and host[0] == '[' and host[-1] == ']': # IPv6
942
954
host = host[1:-1]
944
956
return cls(scheme, user, password, host, port, path)
977
989
:param relpath: relative url string for relative part of remote path.
978
990
:return: urlencoded string for final path.
980
# pad.lv/1696545: For the moment, accept both native strings and unicode.
992
# pad.lv/1696545: For the moment, accept both native strings and
981
994
if isinstance(relpath, str):
983
996
elif isinstance(relpath, text_type):
1026
1039
path = self.quoted_path
1027
1040
return self.__class__(self.scheme, self.quoted_user,
1028
self.quoted_password, self.quoted_host, self.port,
1041
self.quoted_password, self.quoted_host, self.port,
1032
1045
def parse_url(url):
1042
1055
parsed_url = URL.from_string(url)
1043
1056
return (parsed_url.scheme, parsed_url.user, parsed_url.password,
1044
parsed_url.host, parsed_url.port, parsed_url.path)
1057
parsed_url.host, parsed_url.port, parsed_url.path)