/brz/remove-bazaar : contents of bzrlib/urlutils.py at revision 1685.1.51

: (revision 1685.1.51)

To get this branch, use:

    bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# Bazaar-NG -- distributed version control
#
# Copyright (C) 2006 by Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""A collection of functions for handling URL operations."""

import os
from posixpath import split as _posix_split
import re
import sys
import urllib

import bzrlib.errors as errors
import bzrlib.osutils


def basename(url, exclude_trailing_slash=True):
    """Return the final path component of a URL.

    This is split() with everything but the tail discarded.

    :param url: The URL to inspect
    :param exclude_trailing_slash: When true, a url that looks like
        "path/to/foo/" is treated as "path/to/foo", so 'foo' is
        returned instead of ''
    :return: The last component of the URL. The result can be ''
        when exclude_trailing_slash is false, or when the URL is
        already at its root.
    """
    _parent, last_component = split(
        url, exclude_trailing_slash=exclude_trailing_slash)
    return last_component


def dirname(url, exclude_trailing_slash=True):
    """Return the parent of the given URL.

    This is split() with everything but the head discarded.

    :param url: Relative or absolute URL
    :param exclude_trailing_slash: Drop a final slash before splitting
        (http://host/foo/ is treated as http://host/foo, while
        http://host/ is left as http://host/)
    :return: The URL without its last path chunk
    """
    # TODO: jam 20060502 This was named dirname to be consistent
    #       with the os functions, but maybe "parent" would be better
    parent, _last_component = split(
        url, exclude_trailing_slash=exclude_trailing_slash)
    return parent


def escape(relpath):
    """Escape relpath to be a valid url."""
    if isinstance(relpath, unicode):
        relpath = relpath.encode('utf-8')
    # After quoting and encoding, the path should be perfectly
    # safe as a plain ASCII string, str() just enforces this
    return str(urllib.quote(relpath))


def file_relpath(base, path):
    """Return the url-escaped path of one file:// URL relative to another.

    Both arguments are assumed to be fully specified file:// URLs. They
    are converted to local paths, the relative path is computed with
    bzrlib.osutils.relpath, and the result is escaped again.

    :param base: The file:// URL to compute the path relative to
    :param path: The file:// URL to be made relative
    :return: The escaped relative path
    """
    minimum_length = MIN_ABS_FILEURL_LENGTH
    assert len(base) >= minimum_length, (
        'Length of base must be equal or'
        ' exceed the platform minimum url length (which is %d)'
        % minimum_length)

    local_base = local_path_from_url(base)
    local_path = local_path_from_url(path)
    relative = bzrlib.osutils.relpath(local_base, local_path)
    return escape(relative)


def _find_scheme_and_separator(url):
    """Find the scheme separator (://) and the first path separator

    This is just a helper function for other path utilities.
    It could probably be replaced by urlparse

    :param url: The URL (or relative path) to inspect
    :return: A tuple (scheme_loc, first_path_slash).
        scheme_loc is the index of '://' in url (which equals the
        length of the scheme), or None if url has no scheme.
        first_path_slash is the index in url of the first '/' after
        the '://', or None if there is no such slash.
    """
    m = _url_scheme_re.match(url)
    if not m:
        return None, None

    scheme = m.group('scheme')
    path = m.group('path')

    # The regex is anchored at the start of the url, so the '://'
    # separator begins right after the scheme.
    scheme_loc = len(scheme)

    # Find the path separating slash
    # (first slash after the ://)
    first_path_slash = path.find('/')
    if first_path_slash == -1:
        return scheme_loc, None
    # Translate the offset inside 'path' back into an offset inside
    # 'url' by skipping over the scheme and the 3 characters of '://'
    return scheme_loc, first_path_slash + scheme_loc + len('://')


# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
def _posix_local_path_from_url(url):
    """Turn a url such as file:///path/to/foo into /path/to/foo

    :param url: A url starting with file:///
    :return: The unescaped local path
    :raises errors.InvalidURL: if url does not start with file:///
    """
    if url.startswith('file:///'):
        # Drop only 'file://' so that the third slash stays as the
        # leading slash of the path
        return unescape(url[len('file://'):])
    raise errors.InvalidURL(url, 'local urls must start with file:///')


def _posix_local_path_to_url(path):
    """Turn a local path such as ./foo into a url like file:///path/to/foo

    The path is made absolute, normalized and then escaped, so
    unicode and special characters are handled as well.

    :param path: A local path
    :return: A file:// url for that path
    """
    # posixpath is imported explicitly (rather than using os.path)
    # so that this can be tested on non-posix platforms
    from posixpath import normpath
    absolute_path = bzrlib.osutils._posix_abspath(path)
    normalized_path = normpath(absolute_path)
    return 'file://' + escape(normalized_path)


def _win32_local_path_from_url(url):
    """Convert a url like file:///C|/path/to/foo into C:/path/to/foo

    :param url: A url of the form file:///X|/... or file:///X:/...
        where X is a drive letter
    :return: The unescaped local path, with the drive letter in upper
        case and followed by ':'
    :raises errors.InvalidURL: if url does not start with file:///, or
        if what follows is not a drive letter, a '|' or ':' and a '/'
    """
    if not url.startswith('file:///'):
        raise errors.InvalidURL(url, 'local urls must start with file:///')
    # We strip off all 3 slashes
    win32_url = url[len('file:///'):]
    # The length is checked first so that a truncated url such as
    # 'file:///' or 'file:///C|' is reported as an InvalidURL instead
    # of failing with an IndexError in the character checks.
    if (len(win32_url) < 3
        or win32_url[0] not in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        or win32_url[1] not in  '|:'
        or win32_url[2] != '/'):
        raise errors.InvalidURL(url, 'Win32 file urls start with file:///X|/, where X is a valid drive letter')
    # The drive letter is upper-cased for consistency; the rest of the
    # path (starting at the slash after the drive) is unescaped.
    return win32_url[0].upper() + u':' + unescape(win32_url[2:])


def _win32_local_path_to_url(path):
    """Turn a local path such as ./foo into a url like file:///C|/path/to/foo

    The path is made absolute and normalized with the win32 helpers
    from bzrlib.osutils, backslashes become forward slashes, and
    everything after the drive specifier is escaped.

    :param path: A local path
    :return: A file:/// url with an upper-case drive letter followed by '|'
    """
    # The win32-specific helpers are called explicitly so that this
    # can be tested on non-win32 platforms
    absolute_path = bzrlib.osutils._win32_abspath(path)
    normalized_path = bzrlib.osutils._nt_normpath(absolute_path)
    slashed_path = normalized_path.replace('\\', '/')
    drive_letter = slashed_path[0].upper()
    # slashed_path[1] is the ':' after the drive letter; it is replaced
    # by '|' in the url
    return 'file:///' + drive_letter + '|' + escape(slashed_path[2:])


# Bind the public names to the implementation for the running platform.
# MIN_ABS_FILEURL_LENGTH is the length of the shortest absolute file url
# on that platform.
if sys.platform == 'win32':
    local_path_to_url = _win32_local_path_to_url
    local_path_from_url = _win32_local_path_from_url
    MIN_ABS_FILEURL_LENGTH = len('file:///C|/')
else:
    local_path_to_url = _posix_local_path_to_url
    local_path_from_url = _posix_local_path_from_url
    MIN_ABS_FILEURL_LENGTH = len('file:///')


# Matches a url of the form scheme://rest and captures the two halves as
# the named groups 'scheme' and 'path'. The scheme may not contain ':' or
# '/' and must be at least two characters long.
# NOTE(review): presumably the two character minimum keeps a win32 drive
# prefix such as "C://" from being read as a scheme -- confirm.
_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')


def normalize_url(url):
    """Make sure that a path string is in fully normalized URL form.
    
    This handles URLs which have unicode characters, spaces, 
    special characters, etc.

    It has two basic modes of operation, depending on whether the
    supplied string starts with a url specifier (scheme://) or not.
    If it does not have a specifier it is considered a local path,
    and will be converted into a file:/// url. Non-ascii characters
    will be encoded using utf-8.
    If it does have a url specifier, it will be treated as a "hybrid"
    URL. Basically, a URL that should have URL special characters already
    escaped (like +?&# etc), but may have unicode characters, etc
    which would not be valid in a real URL.

    :param url: Either a hybrid URL or a local path
    :return: A normalized URL which only includes 7-bit ASCII characters.
        For a url with a scheme this is always a plain str.
    :raises errors.InvalidURL: if a non-unicode url contains a character
        outside of the url safe set, or if the scheme of a unicode url
        is not plain ASCII
    """
    m = _url_scheme_re.match(url)
    if not m:
        return local_path_to_url(url)
    if not isinstance(url, unicode):
        # TODO: jam 20060510 We need to test for ascii characters that
        #       shouldn't be allowed in URLs
        for c in url:
            if c not in _url_safe_characters:
                raise errors.InvalidURL(url, 'URLs can only contain specific safe characters')
        return url
    # We have a unicode (hybrid) url
    scheme = m.group('scheme')

    escaped_path = []
    for char in m.group('path'):
        if char in _url_safe_characters:
            escaped_path.append(char)
        else:
            # Replace the character with the %XX form of each byte of
            # its utf-8 encoding
            utf8_bytes = char.encode('utf-8')
            escaped_path.append(
                ''.join(['%%%02X' % ord(b) for b in utf8_bytes]))
    normalized = scheme + '://' + ''.join(escaped_path)
    # Everything in the path is ASCII by now, so the only thing that can
    # prevent the conversion to a plain str is a non-ASCII scheme.
    try:
        return str(normalized)
    except UnicodeError:
        raise errors.InvalidURL(url, 'URL scheme must be plain ASCII')


def split(url, exclude_trailing_slash=True):
    """Split a URL into its parent directory and a child directory.

    :param url: A relative or absolute URL
    :param exclude_trailing_slash: Strip off a final '/' if it is part
        of the path (but not if it is part of the protocol specification)
    :return: A tuple (head, tail) where tail is the last path chunk and
        head is everything in front of it. For a url that has a scheme
        but no path separating slash, (url, '') is returned.
    :raises errors.InvalidURL: on win32, if a file:/// url does not have
        a drive specifier (X| or X:) followed by a '/'
    """
    scheme_loc, first_path_slash = _find_scheme_and_separator(url)

    if first_path_slash is None:
        # We have either a relative path, or no separating slash
        if scheme_loc is None:
            # Relative path
            if exclude_trailing_slash and url.endswith('/'):
                url = url[:-1]
            return _posix_split(url)
        else:
            # Scheme with no path
            return url, ''

    # We have a fully defined path
    url_base = url[:first_path_slash] # http://host, file://
    path = url[first_path_slash:] # /file/foo

    if sys.platform == 'win32' and url.startswith('file:///'):
        # The drive specifier is part of the root of the url and not a
        # path chunk, so it is moved from path into url_base.
        # At this point path looks like /C|/foo: path[1] is the drive
        # letter, path[2] the '|' or ':' and path[3] the slash that
        # starts the real path.
        if len(path) < 4 or path[2] not in '|:' or path[3] != '/':
            raise errors.InvalidURL(url, 
                'win32 file:/// paths need a drive letter')
        url_base += path[:3] # file:///C|
        path = path[3:] # /foo

    if exclude_trailing_slash and len(path) > 1 and path.endswith('/'):
        path = path[:-1]
    head, tail = _posix_split(path)
    return url_base + head, tail


def strip_trailing_slash(url):
    """Remove a single trailing slash, unless it marks a root path.

    What counts as a 'root path' depends on the platform.
    All URLs are assumed to be valid netloc urls, i.e. of the form:
    scheme://host/path
    The '://' is located, and the next '/' after it is never removed.
    Relative paths are handled as well.
    Examples:
        path/to/foo       => path/to/foo
        path/to/foo/      => path/to/foo
        http://host/path/ => http://host/path
        http://host/path  => http://host/path
        http://host/      => http://host/
        file:///          => file:///
        file:///foo/      => file:///foo
        # This is unique on win32 platforms, and is the only URL
        # format which does it differently.
        file:///C|/       => file:///C|/

    :param url: A relative or absolute URL
    :return: The url without its trailing slash, or the url unchanged
    """
    if not url.endswith('/'):
        # No trailing slash, so there is nothing to remove
        return url

    stripped = url[:-1]

    if sys.platform == 'win32' and url.startswith('file:///'):
        # On win32 the drive letter belongs to the 'top-level' of the
        # path, so anything no longer than the minimal absolute file
        # url is already a root
        if len(url) <= MIN_ABS_FILEURL_LENGTH:
            return url
        return stripped

    scheme_loc, first_path_slash = _find_scheme_and_separator(url)
    if scheme_loc is None:
        # No scheme means a relative path: just drop the last character
        return stripped

    only_separator_slash = (first_path_slash is None
                            or first_path_slash == len(url) - 1)
    if only_separator_slash:
        # The trailing slash is the path separating slash itself,
        # so it has to stay
        return url
    return stripped


def unescape(url):
    """Unescape relpath from url format.

    This returns a Unicode path from a URL
    """
    # jam 20060427 URLs are supposed to be ASCII only strings
    #       If they are passed in as unicode, urllib.unquote
    #       will return a UNICODE string, which actually contains
    #       utf-8 bytes. So we have to ensure that they are
    #       plain ASCII strings, or the final .decode will
    #       try to encode the UNICODE => ASCII, and then decode
    #       it into utf-8.
    try:
        url = str(url)
    except UnicodeError, e:
        raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
    unquoted = urllib.unquote(url)
    try:
        unicode_path = unquoted.decode('utf-8')
    except UnicodeError, e:
        raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
    return unicode_path


# Characters which must stay escaped if they appear in escaped form
_no_decode_chars = ';/?:@&=+$,#'
_no_decode_ords = [ord(c) for c in _no_decode_chars]
# The two digit hex codes of those characters: first every code in
# lower case, then every code in upper case
_no_decode_hex = []
_no_decode_hex.extend(['%02x' % o for o in _no_decode_ords])
_no_decode_hex.extend(['%02X' % o for o in _no_decode_ords])

# Maps a two digit hex code (lower or upper case) to its display text
_hex_display_map = {}
_hex_display_map.update([('%02x' % o, chr(o)) for o in range(256)])
_hex_display_map.update([('%02X' % o, chr(o)) for o in range(256)])
# The codes of the characters above are displayed in escaped form
_hex_display_map.update((code, '%' + code) for code in _no_decode_hex)

# Characters which never need to be escaped
_url_safe_characters = set(
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '_.-/'
    ';?:@&=+$,%#')


def unescape_for_display(url):
    """Decode as much of a URL as possible, to get a nice looking path.

    file:// urls are turned into local paths. For any other url each
    '/' separated section after the first is unescaped and decoded
    where that is possible.

    :param url: The url to make presentable
    :return: A local path for file:// urls, otherwise the url with the
        decodable escapes replaced
    """
    if url.startswith('file://'):
        return local_path_from_url(url)

    # Work on one '/' separated section at a time, so that a section
    # which fails to decode does not affect the others
    sections = url.split('/')
    for section_idx in xrange(1, len(sections)):
        # Every piece except the first one followed a '%' sign
        pieces = sections[section_idx].split('%')
        for piece_idx in xrange(1, len(pieces)):
            piece = pieces[piece_idx]
            hex_code = piece[:2]
            remainder = piece[2:]
            try:
                shown = _hex_display_map[hex_code] + remainder
            except KeyError:
                # Not a known hex code: restore the percent symbol
                shown = '%' + piece
            except UnicodeDecodeError:
                shown = unichr(int(hex_code, 16)) + remainder
            pieces[piece_idx] = shown
        candidate = ''.join(pieces)
        try:
            sections[section_idx] = candidate.decode('utf-8')
        except UnicodeDecodeError:
            # The unescaped section is not valid utf-8, so it is
            # left exactly as it was in the url
            pass
    return '/'.join(sections)



1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	1	# Bazaar-NG -- distributed version control
	2	#
	3	# Copyright (C) 2006 by Canonical Ltd
	4	#
	5	# This program is free software; you can redistribute it and/or modify
	6	# it under the terms of the GNU General Public License as published by
	7	# the Free Software Foundation; either version 2 of the License, or
	8	# (at your option) any later version.
	9	#
	10	# This program is distributed in the hope that it will be useful,
	11	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	# GNU General Public License for more details.
	14	#
	15	# You should have received a copy of the GNU General Public License
	16	# along with this program; if not, write to the Free Software
	17	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	18
	19	"""A collection of function for handling URL operations."""
	20
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	21	import os
	22	from posixpath import split as _posix_split
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	23	import re
	24	import sys
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	25	import urllib
	26
	27	import bzrlib.errors as errors
	28	import bzrlib.osutils
	29
	30
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	31	def basename(url, exclude_trailing_slash=True):
	32	"""Return the last component of a URL.
	33
	34	:param url: The URL in question
	35	:param exclude_trailing_slash: If the url looks like "path/to/foo/"
	36	ignore the final slash and return 'foo' rather than ''
	37	:return: Just the final component of the URL. This can return ''
	38	if you don't exclude_trailing_slash, or if you are at the
	39	root of the URL.
	40	"""
	41	return split(url, exclude_trailing_slash=exclude_trailing_slash)[1]
	42
	43
	44	def dirname(url, exclude_trailing_slash=True):
	45	"""Return the parent directory of the given path.
	46
	47	:param url: Relative or absolute URL
	48	:param exclude_trailing_slash: Remove a final slash
	49	(treat http://host/foo/ as http://host/foo, but
	50	http://host/ stays http://host/)
	51	:return: Everything in the URL except the last path chunk
	52	"""
	53	# TODO: jam 20060502 This was named dirname to be consistent
	54	# with the os functions, but maybe "parent" would be better
	55	return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
	56
	57
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	58	def escape(relpath):
	59	"""Escape relpath to be a valid url."""
	60	if isinstance(relpath, unicode):
	61	relpath = relpath.encode('utf-8')
	62	# After quoting and encoding, the path should be perfectly
	63	# safe as a plain ASCII string, str() just enforces this
	64	return str(urllib.quote(relpath))
	65
	66
1685.1.46 by John Arbash Meinel Sorting functions by name.	67	def file_relpath(base, path):
	68	"""Compute just the relative sub-portion of a url
	69
	70	This assumes that both paths are already fully specified file:// URLs.
	71	"""
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	72	assert len(base) >= MIN_ABS_FILEURL_LENGTH, ('Length of base must be equal or'
1685.1.46 by John Arbash Meinel Sorting functions by name.	73	' exceed the platform minimum url length (which is %d)' %
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	74	MIN_ABS_FILEURL_LENGTH)
1685.1.46 by John Arbash Meinel Sorting functions by name.	75
	76	base = local_path_from_url(base)
	77	path = local_path_from_url(path)
	78	return escape(bzrlib.osutils.relpath(base, path))
	79
	80
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	81	def _find_scheme_and_separator(url):
	82	"""Find the scheme separator (://) and the first path separator
	83
	84	This is just a helper functions for other path utilities.
	85	It could probably be replaced by urlparse
	86	"""
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	87	m = _url_scheme_re.match(url)
	88	if not m:
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	89	return None, None
	90
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	91	scheme = m.group('scheme')
	92	path = m.group('path')
	93
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	94	# Find the path separating slash
	95	# (first slash after the ://)
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	96	first_path_slash = path.find('/')
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	97	if first_path_slash == -1:
	98	return scheme_loc, None
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	99	return scheme_loc, first_path_slash+len(scheme)+3
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	100
	101
1685.1.46 by John Arbash Meinel Sorting functions by name.	102	# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
	103	def _posix_local_path_from_url(url):
	104	"""Convert a url like file:///path/to/foo into /path/to/foo"""
	105	if not url.startswith('file:///'):
	106	raise errors.InvalidURL(url, 'local urls must start with file:///')
	107	# We only strip off 2 slashes
	108	return unescape(url[len('file://'):])
	109
	110
	111	def _posix_local_path_to_url(path):
	112	"""Convert a local path like ./foo into a URL like file:///path/to/foo
	113
	114	This also handles transforming escaping unicode characters, etc.
	115	"""
	116	# importing directly from posixpath allows us to test this
	117	# on non-posix platforms
	118	from posixpath import normpath
	119	return 'file://' + escape(normpath(bzrlib.osutils._posix_abspath(path)))
	120
	121
	122	def _win32_local_path_from_url(url):
	123	"""Convert a url like file:///C|/path/to/foo into C:/path/to/foo"""
	124	if not url.startswith('file:///'):
	125	raise errors.InvalidURL(url, 'local urls must start with file:///')
	126	# We strip off all 3 slashes
	127	win32_url = url[len('file:///'):]
	128	if (win32_url[0] not in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
	129	or win32_url[1] not in '|:'
	130	or win32_url[2] != '/'):
	131	raise errors.InvalidURL(url, 'Win32 file urls start with file:///X|/, where X is a valid drive letter')
	132	# TODO: jam 20060426, we could .upper() or .lower() the drive letter
	133	# for better consistency.
	134	return win32_url[0].upper() + u':' + unescape(win32_url[2:])
	135
	136
	137	def _win32_local_path_to_url(path):
	138	"""Convert a local path like ./foo into a URL like file:///C|/path/to/foo
	139
	140	This also handles transforming escaping unicode characters, etc.
	141	"""
	142	# importing directly from ntpath allows us to test this
	143	# on non-win32 platforms
	144	# TODO: jam 20060426 consider moving this import outside of the function
	145	win32_path = bzrlib.osutils._nt_normpath(
	146	bzrlib.osutils._win32_abspath(path)).replace('\\', '/')
	147	return 'file:///' + win32_path[0].upper() + '|' + escape(win32_path[2:])
	148
	149
	150	local_path_to_url = _posix_local_path_to_url
	151	local_path_from_url = _posix_local_path_from_url
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	152	MIN_ABS_FILEURL_LENGTH = len('file:///')
1685.1.46 by John Arbash Meinel Sorting functions by name.	153
	154	if sys.platform == 'win32':
	155	local_path_to_url = _win32_local_path_to_url
	156	local_path_from_url = _win32_local_path_from_url
	157
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	158	MIN_ABS_FILEURL_LENGTH = len('file:///C|/')
	159
	160
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	161	_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,})://(?P<path>.*)$')
	162
	163
	164	def normalize_url(url):
	165	"""Make sure that a path string is in fully normalized URL form.
	166
	167	This handles URLs which have unicode characters, spaces,
	168	special characters, etc.
	169
	170	It has two basic modes of operation, depending on whether the
	171	supplied string starts with a url specifier (scheme://) or not.
	172	If it does not have a specifier it is considered a local path,
	173	and will be converted into a file:/// url. Non-ascii characters
	174	will be encoded using utf-8.
	175	If it does have a url specifier, it will be treated as a "hybrid"
	176	URL. Basically, a URL that should have URL special characters already
	177	escaped (like +?&# etc), but may have unicode characters, etc
	178	which would not be valid in a real URL.
	179
	180	:param url: Either a hybrid URL or a local path
	181	:return: A normalized URL which only includes 7-bit ASCII characters.
	182	"""
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	183	m = _url_scheme_re.match(url)
	184	if not m:
	185	return local_path_to_url(url)
	186	if not isinstance(url, unicode):
	187	# TODO: jam 20060510 We need to test for ascii characters that
	188	# shouldn't be allowed in URLs
	189	for c in url:
	190	if c not in _url_safe_characters:
	191	raise errors.InvalidURL(url, 'URLs can only contain specific safe characters')
	192	return url
	193	# We have a unicode (hybrid) url
	194	scheme = m.group('scheme')
	195	path = list(m.group('path'))
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	196
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	197	for i in xrange(len(path)):
	198	if path[i] not in _url_safe_characters:
	199	chars = path[i].encode('utf-8')
	200	path[i] = ''.join(['%%%02X' % ord(c) for c in path[i].encode('utf-8')])
	201	return scheme + '://' + ''.join(path)
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	202
	203
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	204	def split(url, exclude_trailing_slash=True):
	205	"""Split a URL into its parent directory and a child directory.
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	206
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	207	:param url: A relative or absolute URL
	208	:param exclude_trailing_slash: Strip off a final '/' if it is part
	209	of the path (but not if it is part of the protocol specification)
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	210	"""
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	211	scheme_loc, first_path_slash = _find_scheme_and_separator(url)
	212
	213	if first_path_slash is None:
	214	# We have either a relative path, or no separating slash
	215	if scheme_loc is None:
	216	# Relative path
	217	if exclude_trailing_slash and url.endswith('/'):
	218	url = url[:-1]
	219	return _posix_split(url)
	220	else:
	221	# Scheme with no path
	222	return url, ''
	223
	224	# We have a fully defined path
	225	url_base = url[:first_path_slash] # http://host, file://
	226	path = url[first_path_slash:] # /file/foo
	227
	228	if sys.platform == 'win32' and url.startswith('file:///'):
	229	# Strip off the drive letter
	230	if path[2:3] not in '\\/':
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	231	raise errors.InvalidURL(url,
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	232	'win32 file:/// paths need a drive letter')
	233	url_base += path[1:4] # file:///C|/
	234	path = path[3:]
	235
	236	if exclude_trailing_slash and len(path) > 1 and path.endswith('/'):
	237	path = path[:-1]
	238	head, tail = _posix_split(path)
	239	return url_base + head, tail
	240
1685.1.46 by John Arbash Meinel Sorting functions by name.	241
1685.1.47 by John Arbash Meinel s comes before u	242	def strip_trailing_slash(url):
	243	"""Strip trailing slash, except for root paths.
	244
	245	The definition of 'root path' is platform-dependent.
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	246	This assumes that all URLs are valid netloc urls, such that they
	247	form:
	248	scheme://host/path
	249	It searches for ://, and then refuses to remove the next '/'.
	250	It can also handle relative paths
	251	Examples:
	252	path/to/foo => path/to/foo
	253	path/to/foo/ => path/to/foo
	254	http://host/path/ => http://host/path
	255	http://host/path => http://host/path
	256	http://host/ => http://host/
	257	file:/// => file:///
	258	file:///foo/ => file:///foo
	259	# This is unique on win32 platforms, and is the only URL
	260	# format which does it differently.
	261	file:///C|/ => file:///C|/
1685.1.47 by John Arbash Meinel s comes before u	262	"""
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	263	if not url.endswith('/'):
	264	# Nothing to do
	265	return url
	266	if sys.platform == 'win32' and url.startswith('file:///'):
	267	# This gets handled specially, because the 'top-level'
	268	# of a win32 path is actually the drive letter
	269	if len(url) > MIN_ABS_FILEURL_LENGTH:
	270	return url[:-1]
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	271	else:
	272	return url
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	273	scheme_loc, first_path_slash = _find_scheme_and_separator(url)
	274	if scheme_loc is None:
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	275	# This is a relative path, as it has no scheme
	276	# so just chop off the last character
1685.1.47 by John Arbash Meinel s comes before u	277	return url[:-1]
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	278
1685.1.49 by John Arbash Meinel Added bzrlib.urlutils.split and basename + dirname	279	if first_path_slash is None or first_path_slash == len(url)-1:
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	280	# Don't chop off anything if the only slash is the path
	281	# separating slash
1685.1.47 by John Arbash Meinel s comes before u	282	return url
1685.1.47 by John Arbash Meinel s comes before u	283
1685.1.48 by John Arbash Meinel Updated strip_trailing_slash to support lots more url stuff, added tests	284	return url[:-1]
	285
1685.1.47 by John Arbash Meinel s comes before u	286
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	287	def unescape(url):
	288	"""Unescape relpath from url format.
	289
	290	This returns a Unicode path from a URL
	291	"""
	292	# jam 20060427 URLs are supposed to be ASCII only strings
	293	# If they are passed in as unicode, urllib.unquote
	294	# will return a UNICODE string, which actually contains
	295	# utf-8 bytes. So we have to ensure that they are
	296	# plain ASCII strings, or the final .decode will
	297	# try to encode the UNICODE => ASCII, and then decode
	298	# it into utf-8.
	299	try:
	300	url = str(url)
	301	except UnicodeError, e:
	302	raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
	303	unquoted = urllib.unquote(url)
	304	try:
	305	unicode_path = unquoted.decode('utf-8')
	306	except UnicodeError, e:
	307	raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
	308	return unicode_path
	309
	310
	311	# These are characters that if escaped, should stay that way
	312	_no_decode_chars = ';/?:@&=+$,#'
	313	_no_decode_ords = [ord(c) for c in _no_decode_chars]
	314	_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
	315	+ ['%02X' % o for o in _no_decode_ords])
1685.1.50 by John Arbash Meinel Added an re for handling scheme paths.	316	_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
	317	+ [('%02X' % o, chr(o)) for o in range(256)]))
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	318	#These entries get mapped to themselves
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	319	_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
1685.1.51 by John Arbash Meinel Working on getting normalize_url working.	320
	321	# These characters should not be escaped
	322	_url_safe_characters = set('abcdefghijklmnopqrstuvwxyz'
	323	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
	324	'0123456789' '_.-/'
	325	';?:@&=+$,%#')
1685.1.45 by John Arbash Meinel Moved url functions into bzrlib.urlutils	326
	327
	328	def unescape_for_display(url):
	329	"""Decode what you can for a URL, so that we get a nice looking path.
	330
	331	This will turn file:// urls into local paths, and try to decode
	332	any portions of a http:// style url that it can.
	333	"""
	334	if url.startswith('file://'):
	335	return local_path_from_url(url)
	336
	337	# Split into sections to try to decode utf-8
	338	res = url.split('/')
	339	for i in xrange(1, len(res)):
	340	escaped_chunks = res[i].split('%')
	341	for j in xrange(1, len(escaped_chunks)):
	342	item = escaped_chunks[j]
	343	try:
	344	escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
	345	except KeyError:
	346	# Put back the percent symbol
	347	escaped_chunks[j] = '%' + item
	348	except UnicodeDecodeError:
	349	escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
	350	unescaped = ''.join(escaped_chunks)
	351	try:
	352	res[i] = unescaped.decode('utf-8')
	353	except UnicodeDecodeError:
	354	# If this path segment cannot be properly utf-8 decoded
	355	# after doing unescaping we will just leave it alone
	356	pass
	357	return '/'.join(res)
	358
1685.1.46 by John Arbash Meinel Sorting functions by name.	359