68
70
if password is not None:
69
71
password = urllib.unquote(password)
71
password = ui_factory.get_password(prompt='HTTP %(user)@%(host) password',
72
user=username, host=host)
73
password = ui.ui_factory.get_password(
74
prompt='HTTP %(user)s@%(host)s password',
75
user=username, host=host)
73
76
password_manager.add_password(None, host, username, password)
74
77
url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
78
def _extract_headers(header_text, url):
79
"""Extract the mapping for an rfc2822 header
81
This is a helper function for the test suite and for _pycurl.
82
(urllib already parses the headers for us)
84
In the case that there are multiple headers inside the file,
85
the last one is returned.
87
:param header_text: A string of header information.
88
This expects that the first line of a header will always be HTTP ...
89
:param url: The url we are parsing, so we can raise nice errors
90
:return: mimetools.Message object, which basically acts like a case
91
insensitive dictionary.
94
remaining = header_text
97
raise errors.InvalidHttpResponse(url, 'Empty headers')
100
header_file = StringIO(remaining)
101
first_line = header_file.readline()
102
if not first_line.startswith('HTTP'):
103
if first_header: # The first header *must* start with HTTP
104
raise errors.InvalidHttpResponse(url,
105
'Opening header line did not start with HTTP: %s'
107
assert False, 'Opening header line was not HTTP'
109
break # We are done parsing
111
m = mimetools.Message(header_file)
113
# mimetools.Message parses the first header up to a blank line
114
# So while there is remaining data, it probably means there is
115
# another header to be parsed.
116
# Get rid of any preceding whitespace, which if it is all whitespace
117
# will get rid of everything.
118
remaining = header_file.read().lstrip()
122
class HttpTransportBase(Transport):
81
class HttpTransportBase(ConnectedTransport, medium.SmartClientMedium):
123
82
"""Base class for http implementations.
125
84
Does URL parsing, etc, but not any network IO.
131
# _proto: "http" or "https"
132
# _qualified_proto: may have "+pycurl", etc
90
# _unqualified_scheme: "http" or "https"
91
# _scheme: may have "+pycurl", etc
134
def __init__(self, base):
93
def __init__(self, base, _from_transport=None):
135
94
"""Set the base path where files will be stored."""
136
95
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
137
96
if not proto_match:
138
97
raise AssertionError("not a http url: %r" % base)
139
self._proto = proto_match.group(1)
98
self._unqualified_scheme = proto_match.group(1)
140
99
impl_name = proto_match.group(2)
142
101
impl_name = impl_name[1:]
143
102
self._impl_name = impl_name
146
super(HttpTransportBase, self).__init__(base)
147
# In the future we might actually connect to the remote host
148
# rather than using get_url
149
# self._connection = None
150
(apparent_proto, self._host,
151
self._path, self._parameters,
152
self._query, self._fragment) = urlparse.urlparse(self.base)
153
self._qualified_proto = apparent_proto
155
def abspath(self, relpath):
156
"""Return the full url to the given relative path.
158
This can be supplied with a string or a list.
160
The URL returned always has the protocol scheme originally used to
161
construct the transport, even if that includes an explicit
162
implementation qualifier.
164
assert isinstance(relpath, basestring)
165
if isinstance(relpath, unicode):
166
raise InvalidURL(relpath, 'paths must not be unicode.')
167
if isinstance(relpath, basestring):
168
relpath_parts = relpath.split('/')
103
super(HttpTransportBase, self).__init__(base,
104
_from_transport=_from_transport)
105
# range hint is handled dynamically throughout the life
106
# of the transport object. We start by trying multi-range
107
# requests and if the server returns bogus results, we
108
# retry with single range requests and, finally, we
109
# forget about range if the server really can't
110
# understand. Once acquired, this piece of info is
111
# propagated to clones.
112
if _from_transport is not None:
113
self._range_hint = _from_transport._range_hint
170
# TODO: Don't call this with an array - no magic interfaces
171
relpath_parts = relpath[:]
172
if len(relpath_parts) > 1:
173
# TODO: Check that the "within branch" part of the
174
# error messages below is relevant in all contexts
175
if relpath_parts[0] == '':
176
raise ValueError("path %r within branch %r seems to be absolute"
177
% (relpath, self._path))
178
# read only transports never manipulate directories
179
if self.is_readonly() and relpath_parts[-1] == '':
180
raise ValueError("path %r within branch %r seems to be a directory"
181
% (relpath, self._path))
182
basepath = self._path.split('/')
183
if len(basepath) > 0 and basepath[-1] == '':
184
basepath = basepath[:-1]
185
for p in relpath_parts:
187
if len(basepath) == 0:
188
# In most filesystems, a request for the parent
189
# of root, just returns root.
192
elif p == '.' or p == '':
196
# Possibly, we could use urlparse.urljoin() here, but
197
# I'm concerned about when it chooses to strip the last
198
# portion of the path, and when it doesn't.
199
path = '/'.join(basepath)
202
result = urlparse.urlunparse((self._qualified_proto,
203
self._host, path, '', '', ''))
206
def _real_abspath(self, relpath):
207
"""Produce absolute path, adjusting protocol if needed"""
208
abspath = self.abspath(relpath)
209
qp = self._qualified_proto
211
if self._qualified_proto != self._proto:
212
abspath = rp + abspath[len(qp):]
213
if not isinstance(abspath, str):
214
# escaping must be done at a higher level
215
abspath = abspath.encode('ascii')
115
self._range_hint = 'multi'
218
117
def has(self, relpath):
219
118
raise NotImplementedError("has() is abstract on %r" % self)
224
123
:param relpath: The relative path to the file
226
125
code, response_file = self._get(relpath, None)
126
# FIXME: some callers want an iterable... One step forward, three steps
127
# backwards :-/ And not only an iterable, but an iterable that can be
128
# seeked backwards, so we will never be able to do that. One such
129
# known client is bzrlib.bundle.serializer.v4.get_bundle_reader. At the
130
# time of this writing it's even the only known client -- vila20071203
131
return StringIO(response_file.read())
229
def _get(self, relpath, ranges):
133
def _get(self, relpath, ranges, tail_amount=0):
230
134
"""Get a file, or part of a file.
232
136
:param relpath: Path relative to transport base URL
233
:param byte_range: None to get the whole file;
234
or [(start,end)] to fetch parts of a file.
137
:param ranges: None to get the whole file;
138
or a list of _CoalescedOffset to fetch parts of a file.
139
:param tail_amount: The amount to get from the end of the file.
236
141
:returns: (http_code, result_file)
238
Note that the current http implementations can only fetch one range at
239
a time through this call.
241
143
raise NotImplementedError(self._get)
243
def readv(self, relpath, offsets):
145
def _remote_path(self, relpath):
146
"""See ConnectedTransport._remote_path.
148
user and passwords are not embedded in the path provided to the server.
150
relative = urlutils.unescape(relpath).encode('utf-8')
151
path = self._combine_paths(self._path, relative)
152
return self._unsplit_url(self._unqualified_scheme,
153
None, None, self._host, self._port, path)
155
def _create_auth(self):
156
"""Returns a dict returning the credentials provided at build time."""
157
auth = dict(host=self._host, port=self._port,
158
user=self._user, password=self._password,
159
protocol=self._unqualified_scheme,
163
def get_request(self):
164
return SmartClientHTTPMediumRequest(self)
166
def get_smart_medium(self):
167
"""See Transport.get_smart_medium.
169
HttpTransportBase directly implements the minimal interface of
170
SmartMediumClient, so this returns self.
174
def _degrade_range_hint(self, relpath, ranges, exc_info):
175
if self._range_hint == 'multi':
176
self._range_hint = 'single'
177
mutter('Retry "%s" with single range request' % relpath)
178
elif self._range_hint == 'single':
179
self._range_hint = None
180
mutter('Retry "%s" without ranges' % relpath)
182
# We tried all the tricks, but nothing worked. We re-raise the
183
# original exception; the 'mutter' calls above will indicate that
184
# further tries were unsuccessful
185
raise exc_info[0], exc_info[1], exc_info[2]
187
# _coalesce_offsets is a helper for readv, it tries to combine ranges without
188
# degrading readv performance. _bytes_to_read_before_seek is the value
189
# used for the limit parameter and has been tuned for other transports. For
190
# HTTP, the name is inappropriate but the parameter is still useful and
191
# helps reduce the number of chunks in the response. The overhead for a
192
# chunk (headers, length, footer around the data itself) is variable but
193
# around 50 bytes. We use 128 to reduce the range specifiers that appear in
194
# the header, some servers (notably Apache) enforce a maximum length for a
195
# header and issue a '400: Bad request' error when too many ranges are
197
_bytes_to_read_before_seek = 128
198
# No limit on the offset number that get combined into one, we are trying
199
# to avoid downloading the whole file.
200
_max_readv_combine = 0
201
# By default Apache has a limit of ~400 ranges before replying with a 400
202
# Bad Request. So we go underneath that amount to be safe.
203
_max_get_ranges = 200
204
# We impose no limit on the range size. But see _pycurl.py for a different
208
def _readv(self, relpath, offsets):
244
209
"""Get parts of the file at the given relative path.
246
211
:param offsets: A list of (offset, size) tuples.
247
212
:return: A list or generator of (offset, data) tuples
249
ranges = self.offsets_to_ranges(offsets)
250
mutter('http readv of %s collapsed %s offsets => %s',
251
relpath, len(offsets), ranges)
252
code, f = self._get(relpath, ranges)
253
for start, size in offsets:
254
f.seek(start, (start < 0) and 2 or 0)
257
assert len(data) == size
261
def offsets_to_ranges(offsets):
262
"""Turn a list of offsets and sizes into a list of byte ranges.
264
:param offsets: A list of tuples of (start, size). An empty list
266
:return: a list of inclusive byte ranges (start, end)
267
Adjacent ranges will be combined.
269
# Make sure we process sorted offsets
270
offsets = sorted(offsets)
275
for start, size in offsets:
276
end = start + size - 1
278
combined.append([start, end])
279
elif start <= prev_end + 1:
280
combined[-1][1] = end
215
# offsets may be a generator, we will iterate it several times, so
217
offsets = list(offsets)
220
retried_offset = None
224
# Coalesce the offsets to minimize the GET requests issued
225
sorted_offsets = sorted(offsets)
226
coalesced = self._coalesce_offsets(
227
sorted_offsets, limit=self._max_readv_combine,
228
fudge_factor=self._bytes_to_read_before_seek,
229
max_size=self._get_max_size)
231
# Turn it into a list, we will iterate it several times
232
coalesced = list(coalesced)
233
mutter('http readv of %s offsets => %s collapsed %s',
234
relpath, len(offsets), len(coalesced))
236
# Cache the data read, but only until it's been used
238
# We will iterate on the data received from the GET requests and
239
# serve the corresponding offsets respecting the initial order. We
240
# need an offset iterator for that.
241
iter_offsets = iter(offsets)
242
cur_offset_and_size = iter_offsets.next()
245
for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
246
# Split the received chunk
247
for offset, size in cur_coal.ranges:
248
start = cur_coal.start + offset
250
data = rfile.read(size)
253
raise errors.ShortReadvError(relpath, start, size,
255
if (start, size) == cur_offset_and_size:
256
# The offsets requested are sorted like the coalesced
257
# ones, no need to cache. Win !
258
yield cur_offset_and_size[0], data
259
cur_offset_and_size = iter_offsets.next()
261
# Different sorting. We need to cache.
262
data_map[(start, size)] = data
264
# Yield everything we can
265
while cur_offset_and_size in data_map:
266
# Clean the cached data since we use it
267
# XXX: will break if offsets contains duplicates --
269
this_data = data_map.pop(cur_offset_and_size)
270
yield cur_offset_and_size[0], this_data
271
cur_offset_and_size = iter_offsets.next()
273
except (errors.ShortReadvError, errors.InvalidRange,
274
errors.InvalidHttpRange), e:
275
mutter('Exception %r: %s during http._readv',e, e)
276
if (not isinstance(e, errors.ShortReadvError)
277
or retried_offset == cur_offset_and_size):
278
# We don't degrade the range hint for ShortReadvError since
279
# they do not indicate a problem with the server ability to
280
# handle ranges. Except when we fail to get back a required
281
# offset twice in a row. In that case, falling back to
282
# single range or whole file should help or end up in a
284
self._degrade_range_hint(relpath, coalesced, sys.exc_info())
285
# Some offsets may have been already processed, so we retry
286
# only the unsuccessful ones.
287
offsets = [cur_offset_and_size] + [o for o in iter_offsets]
288
retried_offset = cur_offset_and_size
291
def _coalesce_readv(self, relpath, coalesced):
292
"""Issue several GET requests to satisfy the coalesced offsets"""
294
def get_and_yield(relpath, coalesced):
296
# Note that the _get below may raise
297
# errors.InvalidHttpRange. It's the caller's responsibility to
298
# decide how to retry since it may provide different coalesced
300
code, rfile = self._get(relpath, coalesced)
301
for coal in coalesced:
304
if self._range_hint is None:
305
# Download whole file
306
for c, rfile in get_and_yield(relpath, coalesced):
309
total = len(coalesced)
310
if self._range_hint == 'multi':
311
max_ranges = self._max_get_ranges
312
elif self._range_hint == 'single':
282
combined.append([start, end])
315
raise AssertionError("Unknown _range_hint %r"
316
% (self._range_hint,))
317
# TODO: Some web servers may ignore the range requests and return
318
# the whole file, we may want to detect that and avoid further
320
# Hint: test_readv_multiple_get_requests will fail once we do that
323
for coal in coalesced:
324
if ((self._get_max_size > 0
325
and cumul + coal.length > self._get_max_size)
326
or len(ranges) >= max_ranges):
327
# Get that much and yield
328
for c, rfile in get_and_yield(relpath, ranges):
330
# Restart with the current offset
336
# Get the rest and yield
337
for c, rfile in get_and_yield(relpath, ranges):
340
def recommended_page_size(self):
341
"""See Transport.recommended_page_size().
343
For HTTP we suggest a large page size to reduce the overhead
344
introduced by latency.
348
def _post(self, body_bytes):
349
"""POST body_bytes to .bzr/smart on this transport.
351
:returns: (response code, response body file-like object).
353
# TODO: Requiring all the body_bytes to be available at the beginning of
354
# the POST may require large client buffers. It would be nice to have
355
# an interface that allows streaming via POST when possible (and
356
# degrades to a local buffer when not).
357
raise NotImplementedError(self._post)
287
359
def put_file(self, relpath, f, mode=None):
288
360
"""Copy the file-like object into the location.
368
446
:return: A lock object, which should be passed to Transport.unlock()
370
raise TransportNotPossible('http does not support lock_write()')
448
raise errors.TransportNotPossible('http does not support lock_write()')
372
450
def clone(self, offset=None):
373
451
"""Return a new HttpTransportBase with root at self.base + offset
374
For now HttpTransportBase does not actually connect, so just return
375
a new HttpTransportBase object.
453
We let the daughter classes take advantage of the hint
454
that it's a cloning not a raw creation.
377
456
if offset is None:
378
return self.__class__(self.base)
380
return self.__class__(self.abspath(offset))
457
return self.__class__(self.base, self)
459
return self.__class__(self.abspath(offset), self)
461
def _attempted_range_header(self, offsets, tail_amount):
462
"""Prepare a HTTP Range header at a level the server should accept.
464
:return: the range header representing offsets/tail_amount or None if
465
no header can be built.
468
if self._range_hint == 'multi':
469
# Generate the header describing all offsets
470
return self._range_header(offsets, tail_amount)
471
elif self._range_hint == 'single':
472
# Combine all the requested ranges into a single
475
if tail_amount not in (0, None):
476
# Nothing we can do here to combine ranges with tail_amount
477
# in a single range, just returns None. The whole file
478
# should be downloaded.
481
start = offsets[0].start
483
end = last.start + last.length - 1
484
whole = self._coalesce_offsets([(start, end - start + 1)],
485
limit=0, fudge_factor=0)
486
return self._range_header(list(whole), 0)
488
# Only tail_amount requested, leave range_header
490
return self._range_header(offsets, tail_amount)
383
def range_header(ranges, tail_amount):
495
def _range_header(ranges, tail_amount):
384
496
"""Turn a list of bytes ranges into a HTTP Range header value.
386
:param offsets: A list of byte ranges, (start, end). An empty list
498
:param ranges: A list of _CoalescedOffset
499
:param tail_amount: The amount to get from the end of the file.
389
501
:return: HTTP range header string.
503
At least a non-empty ranges *or* a tail_amount must be
392
for start, end in ranges:
393
strings.append('%d-%d' % (start, end))
507
for offset in ranges:
508
strings.append('%d-%d' % (offset.start,
509
offset.start + offset.length - 1))
396
512
strings.append('-%d' % tail_amount)
398
514
return ','.join(strings)
401
#---------------- test server facilities ----------------
402
# TODO: load these only when running tests
405
class WebserverNotAvailable(Exception):
409
class BadWebserverPath(ValueError):
411
return 'path %s is not in %s' % self.args
414
class TestingHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
416
def log_message(self, format, *args):
417
self.server.test_case.log('webserver - %s - - [%s] %s "%s" "%s"',
418
self.address_string(),
419
self.log_date_time_string(),
421
self.headers.get('referer', '-'),
422
self.headers.get('user-agent', '-'))
424
def handle_one_request(self):
425
"""Handle a single HTTP request.
427
You normally don't need to override this method; see the class
428
__doc__ string for information on how to handle specific HTTP
429
commands such as GET and POST.
432
for i in xrange(1,11): # Don't try more than 10 times
434
self.raw_requestline = self.rfile.readline()
435
except socket.error, e:
436
if e.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
437
# omitted for now because some tests look at the log of
438
# the server and expect to see no errors. see recent
439
# email thread. -- mbp 20051021.
440
## self.log_message('EAGAIN (%d) while reading from raw_requestline' % i)
446
if not self.raw_requestline:
447
self.close_connection = 1
449
if not self.parse_request(): # An error code has been sent, just exit
451
mname = 'do_' + self.command
452
if getattr(self, mname, None) is None:
453
self.send_error(501, "Unsupported method (%r)" % self.command)
455
method = getattr(self, mname)
458
if sys.platform == 'win32':
459
# On win32 you cannot access non-ascii filenames without
460
# decoding them into unicode first.
461
# However, under Linux, you can access bytestream paths
462
# without any problems. If this function was always active
463
# it would probably break tests when LANG=C was set
464
def translate_path(self, path):
465
"""Translate a /-separated PATH to the local filename syntax.
467
For bzr, all url paths are considered to be utf8 paths.
468
On Linux, you can access these paths directly over the bytestream
469
request, but on win32, you must decode them, and access them
472
# abandon query parameters
473
path = urlparse.urlparse(path)[2]
474
path = posixpath.normpath(urllib.unquote(path))
475
path = path.decode('utf-8')
476
words = path.split('/')
477
words = filter(None, words)
480
drive, word = os.path.splitdrive(word)
481
head, word = os.path.split(word)
482
if word in (os.curdir, os.pardir): continue
483
path = os.path.join(path, word)
487
class TestingHTTPServer(BaseHTTPServer.HTTPServer):
488
def __init__(self, server_address, RequestHandlerClass, test_case):
489
BaseHTTPServer.HTTPServer.__init__(self, server_address,
491
self.test_case = test_case
494
class HttpServer(Server):
495
"""A test server for http transports."""
497
# used to form the url that connects to this server
498
_url_protocol = 'http'
500
# Subclasses can provide a specific request handler
501
def __init__(self, request_handler=TestingHTTPRequestHandler):
502
Server.__init__(self)
503
self.request_handler = request_handler
505
def _http_start(self):
507
httpd = TestingHTTPServer(('localhost', 0),
508
self.request_handler,
510
host, port = httpd.socket.getsockname()
511
self._http_base_url = '%s://localhost:%s/' % (self._url_protocol, port)
512
self._http_starting.release()
513
httpd.socket.settimeout(0.1)
515
while self._http_running:
517
httpd.handle_request()
518
except socket.timeout:
521
def _get_remote_url(self, path):
522
path_parts = path.split(os.path.sep)
523
if os.path.isabs(path):
524
if path_parts[:len(self._local_path_parts)] != \
525
self._local_path_parts:
526
raise BadWebserverPath(path, self.test_dir)
527
remote_path = '/'.join(path_parts[len(self._local_path_parts):])
529
remote_path = '/'.join(path_parts)
531
self._http_starting.acquire()
532
self._http_starting.release()
533
return self._http_base_url + remote_path
535
def log(self, format, *args):
536
"""Capture Server log output."""
537
self.logs.append(format % args)
540
"""See bzrlib.transport.Server.setUp."""
541
self._home_dir = os.getcwdu()
542
self._local_path_parts = self._home_dir.split(os.path.sep)
543
self._http_starting = threading.Lock()
544
self._http_starting.acquire()
545
self._http_running = True
546
self._http_base_url = None
547
self._http_thread = threading.Thread(target=self._http_start)
548
self._http_thread.setDaemon(True)
549
self._http_thread.start()
550
self._http_proxy = os.environ.get("http_proxy")
551
if self._http_proxy is not None:
552
del os.environ["http_proxy"]
556
"""See bzrlib.transport.Server.tearDown."""
557
self._http_running = False
558
self._http_thread.join()
559
if self._http_proxy is not None:
561
os.environ["http_proxy"] = self._http_proxy
564
"""See bzrlib.transport.Server.get_url."""
565
return self._get_remote_url(self._home_dir)
567
def get_bogus_url(self):
568
"""See bzrlib.transport.Server.get_bogus_url."""
569
# this is chosen to try to prevent trouble with proxies, weird dns,
571
return 'http://127.0.0.1:1/'
516
def send_http_smart_request(self, bytes):
518
code, body_filelike = self._post(bytes)
520
raise InvalidHttpResponse(
521
self._remote_path('.bzr/smart'),
522
'Expected 200 response code, got %r' % (code,))
523
except errors.InvalidHttpResponse, e:
524
raise errors.SmartProtocolError(str(e))
527
def should_probe(self):
530
def remote_path_from_transport(self, transport):
531
# Strip the optional 'bzr+' prefix from transport so it will have the
532
# same scheme as self.
533
transport_base = transport.base
534
if transport_base.startswith('bzr+'):
535
transport_base = transport_base[4:]
536
rel_url = urlutils.relative_url(self.base, transport_base)
537
return urllib.unquote(rel_url)
540
class SmartClientHTTPMediumRequest(medium.SmartClientMediumRequest):
541
"""A SmartClientMediumRequest that works with an HTTP medium."""
543
def __init__(self, client_medium):
544
medium.SmartClientMediumRequest.__init__(self, client_medium)
547
def _accept_bytes(self, bytes):
548
self._buffer += bytes
550
def _finished_writing(self):
551
data = self._medium.send_http_smart_request(self._buffer)
552
self._response_body = data
554
def _read_bytes(self, count):
555
"""See SmartClientMediumRequest._read_bytes."""
556
return self._response_body.read(count)
558
def _read_line(self):
559
line, excess = medium._get_line(self._response_body.read)
561
raise AssertionError(
562
'_get_line returned excess bytes, but this mediumrequest '
563
'cannot handle excess. (%r)' % (excess,))
566
def _finished_reading(self):
567
"""See SmartClientMediumRequest._finished_reading."""