81
# _proto: "http" or "https"
82
# _qualified_proto: may have "+pycurl", etc
90
# _unqualified_scheme: "http" or "https"
91
# _scheme: may have "+pycurl", etc
84
def __init__(self, base):
93
def __init__(self, base, _from_transport=None):
85
94
"""Set the base path where files will be stored."""
86
95
proto_match = re.match(r'^(https?)(\+\w+)?://', base)
87
96
if not proto_match:
88
97
raise AssertionError("not a http url: %r" % base)
89
self._proto = proto_match.group(1)
98
self._unqualified_scheme = proto_match.group(1)
90
99
impl_name = proto_match.group(2)
92
101
impl_name = impl_name[1:]
93
102
self._impl_name = impl_name
96
super(HttpTransportBase, self).__init__(base)
97
# In the future we might actually connect to the remote host
98
# rather than using get_url
99
# self._connection = None
100
(apparent_proto, self._host,
101
self._path, self._parameters,
102
self._query, self._fragment) = urlparse.urlparse(self.base)
103
self._qualified_proto = apparent_proto
105
def abspath(self, relpath):
106
"""Return the full url to the given relative path.
108
This can be supplied with a string or a list.
110
The URL returned always has the protocol scheme originally used to
111
construct the transport, even if that includes an explicit
112
implementation qualifier.
114
assert isinstance(relpath, basestring)
115
if isinstance(relpath, unicode):
116
raise InvalidURL(relpath, 'paths must not be unicode.')
117
if isinstance(relpath, basestring):
118
relpath_parts = relpath.split('/')
103
super(HttpTransportBase, self).__init__(base,
104
_from_transport=_from_transport)
105
# range hint is handled dynamically throughout the life
106
# of the transport object. We start by trying multi-range
107
# requests and if the server returns bogus results, we
108
# retry with single range requests and, finally, we
109
# forget about range if the server really can't
110
# understand. Once acquired, this piece of info is
111
# propagated to clones.
112
if _from_transport is not None:
113
self._range_hint = _from_transport._range_hint
120
# TODO: Don't call this with an array - no magic interfaces
121
relpath_parts = relpath[:]
122
if len(relpath_parts) > 1:
123
if relpath_parts[0] == '':
124
raise ValueError("path %r within branch %r seems to be absolute"
125
% (relpath, self._path))
126
if relpath_parts[-1] == '':
127
raise ValueError("path %r within branch %r seems to be a directory"
128
% (relpath, self._path))
129
basepath = self._path.split('/')
130
if len(basepath) > 0 and basepath[-1] == '':
131
basepath = basepath[:-1]
132
for p in relpath_parts:
134
if len(basepath) == 0:
135
# In most filesystems, a request for the parent
136
# of root, just returns root.
139
elif p == '.' or p == '':
143
# Possibly, we could use urlparse.urljoin() here, but
144
# I'm concerned about when it chooses to strip the last
145
# portion of the path, and when it doesn't.
146
path = '/'.join(basepath)
149
result = urlparse.urlunparse((self._qualified_proto,
150
self._host, path, '', '', ''))
153
def _real_abspath(self, relpath):
154
"""Produce absolute path, adjusting protocol if needed"""
155
abspath = self.abspath(relpath)
156
qp = self._qualified_proto
158
if self._qualified_proto != self._proto:
159
abspath = rp + abspath[len(qp):]
160
if not isinstance(abspath, str):
161
# escaping must be done at a higher level
162
abspath = abspath.encode('ascii')
115
self._range_hint = 'multi'
165
117
def has(self, relpath):
    """Abstract placeholder for the existence check on this base class.

    :param relpath: Not used by this implementation.
    :raises NotImplementedError: Always; the message states that has()
        is abstract on this object.
    """
    raise NotImplementedError("has() is abstract on %r" % self)
171
123
:param relpath: The relative path to the file
173
125
code, response_file = self._get(relpath, None)
126
# FIXME: some callers want an iterable... One step forward, three steps
127
# backwards :-/ And not only an iterable, but an iterable that can be
128
# seeked backwards, so we will never be able to do that. One such
129
# known client is bzrlib.bundle.serializer.v4.get_bundle_reader. At the
130
# time of this writing it's even the only known client -- vila20071203
131
return StringIO(response_file.read())
176
def _get(self, relpath, ranges):
133
def _get(self, relpath, ranges, tail_amount=0):
177
134
"""Get a file, or part of a file.
179
136
:param relpath: Path relative to transport base URL
180
:param byte_range: None to get the whole file;
181
or [(start,end)] to fetch parts of a file.
137
:param ranges: None to get the whole file;
138
or a list of _CoalescedOffset to fetch parts of a file.
139
:param tail_amount: The amount to get from the end of the file.
183
141
:returns: (http_code, result_file)
185
Note that the current http implementations can only fetch one range at
186
a time through this call.
188
143
raise NotImplementedError(self._get)
190
def readv(self, relpath, offsets):
145
def _remote_path(self, relpath):
146
"""See ConnectedTransport._remote_path.
148
user and passwords are not embedded in the path provided to the server.
150
relative = urlutils.unescape(relpath).encode('utf-8')
151
path = self._combine_paths(self._path, relative)
152
return self._unsplit_url(self._unqualified_scheme,
153
None, None, self._host, self._port, path)
155
def _create_auth(self):
156
"""Returns a dict returning the credentials provided at build time."""
157
auth = dict(host=self._host, port=self._port,
158
user=self._user, password=self._password,
159
protocol=self._unqualified_scheme,
163
def get_request(self):
    """Return a new SmartClientHTTPMediumRequest built around this object."""
    return SmartClientHTTPMediumRequest(self)
166
def get_smart_medium(self):
167
"""See Transport.get_smart_medium.
169
HttpTransportBase directly implements the minimal interface of
170
SmartMediumClient, so this returns self.
174
def _degrade_range_hint(self, relpath, ranges, exc_info):
175
if self._range_hint == 'multi':
176
self._range_hint = 'single'
177
mutter('Retry "%s" with single range request' % relpath)
178
elif self._range_hint == 'single':
179
self._range_hint = None
180
mutter('Retry "%s" without ranges' % relpath)
182
# We tried all the tricks, but nothing worked. We re-raise the
183
# original exception; the 'mutter' calls above will indicate that
184
# further tries were unsuccessful
185
raise exc_info[0], exc_info[1], exc_info[2]
187
# _coalesce_offsets is a helper for readv, it tries to combine ranges without
188
# degrading readv performance. _bytes_to_read_before_seek is the value
189
# used for the limit parameter and has been tuned for other transports. For
190
# HTTP, the name is inappropriate but the parameter is still useful and
191
# helps reduce the number of chunks in the response. The overhead for a
192
# chunk (headers, length, footer around the data itself) is variable but
193
# around 50 bytes. We use 128 to reduce the range specifiers that appear in
194
# the header, some servers (notably Apache) enforce a maximum length for a
195
# header and issue a '400: Bad request' error when too many ranges are
197
_bytes_to_read_before_seek = 128
198
# No limit on the offset number that get combined into one, we are trying
199
# to avoid downloading the whole file.
200
_max_readv_combine = 0
201
# By default Apache has a limit of ~400 ranges before replying with a 400
202
# Bad Request. So we go underneath that amount to be safe.
203
_max_get_ranges = 200
204
# We impose no limit on the range size. But see _pycurl.py for a different
208
def _readv(self, relpath, offsets):
191
209
"""Get parts of the file at the given relative path.
193
211
:param offsets: A list of (offset, size) tuples.
194
212
:param return: A list or generator of (offset, data) tuples
196
# Ideally we would pass one big request asking for all the ranges in
197
# one go; however then the server will give a multipart mime response
198
# back, and we can't parse them yet. So instead we just get one range
199
# per region, and try to coalesce the regions as much as possible.
201
# The read-coalescing code is not quite regular enough to have a
202
# single driver routine and
203
# helper method in Transport.
204
def do_combined_read(combined_offsets):
205
# read one coalesced block
207
for offset, size in combined_offsets:
209
mutter('readv coalesced %d reads.', len(combined_offsets))
210
offset = combined_offsets[0][0]
211
byte_range = (offset, offset + total_size - 1)
212
code, result_file = self._get(relpath, [byte_range])
214
for off, size in combined_offsets:
215
result_bytes = result_file.read(size)
216
assert len(result_bytes) == size
217
yield off, result_bytes
219
data = result_file.read(offset + total_size)[offset:offset + total_size]
221
for offset, size in combined_offsets:
222
yield offset, data[pos:pos + size]
227
pending_offsets = deque(offsets)
228
combined_offsets = []
229
while len(pending_offsets):
230
offset, size = pending_offsets.popleft()
231
if not combined_offsets:
232
combined_offsets = [[offset, size]]
215
# offsets may be a generator, we will iterate it several times, so
217
offsets = list(offsets)
220
retried_offset = None
224
# Coalesce the offsets to minimize the GET requests issued
225
sorted_offsets = sorted(offsets)
226
coalesced = self._coalesce_offsets(
227
sorted_offsets, limit=self._max_readv_combine,
228
fudge_factor=self._bytes_to_read_before_seek,
229
max_size=self._get_max_size)
231
# Turn it into a list, we will iterate it several times
232
coalesced = list(coalesced)
233
mutter('http readv of %s offsets => %s collapsed %s',
234
relpath, len(offsets), len(coalesced))
236
# Cache the data read, but only until it's been used
238
# We will iterate on the data received from the GET requests and
239
# serve the corresponding offsets respecting the initial order. We
240
# need an offset iterator for that.
241
iter_offsets = iter(offsets)
242
cur_offset_and_size = iter_offsets.next()
245
for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
246
# Split the received chunk
247
for offset, size in cur_coal.ranges:
248
start = cur_coal.start + offset
250
data = rfile.read(size)
253
raise errors.ShortReadvError(relpath, start, size,
255
if (start, size) == cur_offset_and_size:
256
# The offset requested are sorted as the coalesced
257
# ones, no need to cache. Win !
258
yield cur_offset_and_size[0], data
259
cur_offset_and_size = iter_offsets.next()
261
# Different sorting. We need to cache.
262
data_map[(start, size)] = data
264
# Yield everything we can
265
while cur_offset_and_size in data_map:
266
# Clean the cached data since we use it
267
# XXX: will break if offsets contains duplicates --
269
this_data = data_map.pop(cur_offset_and_size)
270
yield cur_offset_and_size[0], this_data
271
cur_offset_and_size = iter_offsets.next()
273
except (errors.ShortReadvError, errors.InvalidRange,
274
errors.InvalidHttpRange), e:
275
mutter('Exception %r: %s during http._readv',e, e)
276
if (not isinstance(e, errors.ShortReadvError)
277
or retried_offset == cur_offset_and_size):
278
# We don't degrade the range hint for ShortReadvError since
279
# they do not indicate a problem with the server ability to
280
# handle ranges. Except when we fail to get back a required
281
# offset twice in a row. In that case, falling back to
282
# single range or whole file should help or end up in a
284
self._degrade_range_hint(relpath, coalesced, sys.exc_info())
285
# Some offsets may have been already processed, so we retry
286
# only the unsuccessful ones.
287
offsets = [cur_offset_and_size] + [o for o in iter_offsets]
288
retried_offset = cur_offset_and_size
291
def _coalesce_readv(self, relpath, coalesced):
292
"""Issue several GET requests to satisfy the coalesced offsets"""
294
def get_and_yield(relpath, coalesced):
296
# Note that the _get below may raise
297
# errors.InvalidHttpRange. It's the caller's responsibility to
298
# decide how to retry since it may provide different coalesced
300
code, rfile = self._get(relpath, coalesced)
301
for coal in coalesced:
304
if self._range_hint is None:
305
# Download whole file
306
for c, rfile in get_and_yield(relpath, coalesced):
309
total = len(coalesced)
310
if self._range_hint == 'multi':
311
max_ranges = self._max_get_ranges
312
elif self._range_hint == 'single':
234
if (len (combined_offsets) < 500 and
235
combined_offsets[-1][0] + combined_offsets[-1][1] == offset):
237
combined_offsets.append([offset, size])
315
raise AssertionError("Unknown _range_hint %r"
316
% (self._range_hint,))
317
# TODO: Some web servers may ignore the range requests and return
318
# the whole file, we may want to detect that and avoid further
320
# Hint: test_readv_multiple_get_requests will fail once we do that
323
for coal in coalesced:
324
if ((self._get_max_size > 0
325
and cumul + coal.length > self._get_max_size)
326
or len(ranges) >= max_ranges):
327
# Get that much and yield
328
for c, rfile in get_and_yield(relpath, ranges):
330
# Restart with the current offset
239
# incompatible, or over the threshold issue a read and yield
240
pending_offsets.appendleft((offset, size))
241
for result in do_combined_read(combined_offsets):
243
combined_offsets = []
244
# whatever is left is a single coalesced request
245
if len(combined_offsets):
246
for result in do_combined_read(combined_offsets):
249
def put(self, relpath, f, mode=None):
250
"""Copy the file-like or string object into the location.
336
# Get the rest and yield
337
for c, rfile in get_and_yield(relpath, ranges):
340
def recommended_page_size(self):
341
"""See Transport.recommended_page_size().
343
For HTTP we suggest a large page size to reduce the overhead
344
introduced by latency.
348
def _post(self, body_bytes):
349
"""POST body_bytes to .bzr/smart on this transport.
351
:returns: (response code, response body file-like object).
353
# TODO: Requiring all the body_bytes to be available at the beginning of
354
# the POST may require large client buffers. It would be nice to have
355
# an interface that allows streaming via POST when possible (and
356
# degrades to a local buffer when not).
357
raise NotImplementedError(self._post)
359
def put_file(self, relpath, f, mode=None):
360
"""Copy the file-like object into the location.
252
362
:param relpath: Location to put the contents, relative to base.
253
:param f: File-like or string object.
363
:param f: File-like object.
255
raise TransportNotPossible('http PUT not supported')
365
raise errors.TransportNotPossible('http PUT not supported')
257
367
def mkdir(self, relpath, mode=None):
258
368
"""Create a directory at the given path."""
259
raise TransportNotPossible('http does not support mkdir()')
369
raise errors.TransportNotPossible('http does not support mkdir()')
261
371
def rmdir(self, relpath):
262
372
"""See Transport.rmdir."""
263
raise TransportNotPossible('http does not support rmdir()')
373
raise errors.TransportNotPossible('http does not support rmdir()')
265
def append(self, relpath, f):
375
def append_file(self, relpath, f, mode=None):
266
376
"""Append the text in the file-like object into the final
269
raise TransportNotPossible('http does not support append()')
379
raise errors.TransportNotPossible('http does not support append()')
271
381
def copy(self, rel_from, rel_to):
272
382
"""Copy the item at rel_from to the location at rel_to"""
273
raise TransportNotPossible('http does not support copy()')
383
raise errors.TransportNotPossible('http does not support copy()')
275
385
def copy_to(self, relpaths, other, mode=None, pb=None):
276
386
"""Copy a set of entries from self into another Transport.
330
446
:return: A lock object, which should be passed to Transport.unlock()
332
raise TransportNotPossible('http does not support lock_write()')
448
raise errors.TransportNotPossible('http does not support lock_write()')
334
450
def clone(self, offset=None):
335
451
"""Return a new HttpTransportBase with root at self.base + offset
336
For now HttpTransportBase does not actually connect, so just return
337
a new HttpTransportBase object.
453
We leave the daughter classes take advantage of the hint
454
that it's a cloning not a raw creation.
339
456
if offset is None:
340
return self.__class__(self.base)
457
return self.__class__(self.base, self)
342
return self.__class__(self.abspath(offset))
344
#---------------- test server facilities ----------------
345
# TODO: load these only when running tests
348
class WebserverNotAvailable(Exception):
352
class BadWebserverPath(ValueError):
354
return 'path %s is not in %s' % self.args
357
class TestingHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
359
def log_message(self, format, *args):
360
self.server.test_case.log('webserver - %s - - [%s] %s "%s" "%s"',
361
self.address_string(),
362
self.log_date_time_string(),
364
self.headers.get('referer', '-'),
365
self.headers.get('user-agent', '-'))
367
def handle_one_request(self):
368
"""Handle a single HTTP request.
370
You normally don't need to override this method; see the class
371
__doc__ string for information on how to handle specific HTTP
372
commands such as GET and POST.
459
return self.__class__(self.abspath(offset), self)
461
def _attempted_range_header(self, offsets, tail_amount):
462
"""Prepare a HTTP Range header at a level the server should accept.
464
:return: the range header representing offsets/tail_amount or None if
465
no header can be built.
375
for i in xrange(1,11): # Don't try more than 10 times
377
self.raw_requestline = self.rfile.readline()
378
except socket.error, e:
379
if e.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK):
380
# omitted for now because some tests look at the log of
381
# the server and expect to see no errors. see recent
382
# email thread. -- mbp 20051021.
383
## self.log_message('EAGAIN (%d) while reading from raw_requestline' % i)
468
if self._range_hint == 'multi':
469
# Generate the header describing all offsets
470
return self._range_header(offsets, tail_amount)
471
elif self._range_hint == 'single':
472
# Combine all the requested ranges into a single
475
if tail_amount not in (0, None):
476
# Nothing we can do here to combine ranges with tail_amount
477
# in a single range, just returns None. The whole file
478
# should be downloaded.
481
start = offsets[0].start
483
end = last.start + last.length - 1
484
whole = self._coalesce_offsets([(start, end - start + 1)],
485
limit=0, fudge_factor=0)
486
return self._range_header(list(whole), 0)
389
if not self.raw_requestline:
390
self.close_connection = 1
392
if not self.parse_request(): # An error code has been sent, just exit
394
mname = 'do_' + self.command
395
if not hasattr(self, mname):
396
self.send_error(501, "Unsupported method (%r)" % self.command)
398
method = getattr(self, mname)
402
class TestingHTTPServer(BaseHTTPServer.HTTPServer):
403
def __init__(self, server_address, RequestHandlerClass, test_case):
404
BaseHTTPServer.HTTPServer.__init__(self, server_address,
406
self.test_case = test_case
408
class HttpServer(Server):
409
"""A test server for http transports."""
411
# used to form the url that connects to this server
412
_url_protocol = 'http'
414
def _http_start(self):
416
httpd = TestingHTTPServer(('localhost', 0),
417
TestingHTTPRequestHandler,
419
host, port = httpd.socket.getsockname()
420
self._http_base_url = '%s://localhost:%s/' % (self._url_protocol, port)
421
self._http_starting.release()
422
httpd.socket.settimeout(0.1)
424
while self._http_running:
426
httpd.handle_request()
427
except socket.timeout:
430
def _get_remote_url(self, path):
431
path_parts = path.split(os.path.sep)
432
if os.path.isabs(path):
433
if path_parts[:len(self._local_path_parts)] != \
434
self._local_path_parts:
435
raise BadWebserverPath(path, self.test_dir)
436
remote_path = '/'.join(path_parts[len(self._local_path_parts):])
488
# Only tail_amount, requested, leave range_header
490
return self._range_header(offsets, tail_amount)
438
remote_path = '/'.join(path_parts)
440
self._http_starting.acquire()
441
self._http_starting.release()
442
return self._http_base_url + remote_path
444
def log(self, format, *args):
    """Capture Server log output.

    The message is produced with ``format % args`` and appended to
    ``self.logs``.

    :param format: A %-style format string.
    :param args: Values interpolated into ``format``.
    """
    self.logs.append(format % args)
449
"""See bzrlib.transport.Server.setUp."""
450
self._home_dir = os.getcwdu()
451
self._local_path_parts = self._home_dir.split(os.path.sep)
452
self._http_starting = threading.Lock()
453
self._http_starting.acquire()
454
self._http_running = True
455
self._http_base_url = None
456
self._http_thread = threading.Thread(target=self._http_start)
457
self._http_thread.setDaemon(True)
458
self._http_thread.start()
459
self._http_proxy = os.environ.get("http_proxy")
460
if self._http_proxy is not None:
461
del os.environ["http_proxy"]
465
"""See bzrlib.transport.Server.tearDown."""
466
self._http_running = False
467
self._http_thread.join()
468
if self._http_proxy is not None:
470
os.environ["http_proxy"] = self._http_proxy
473
"""See bzrlib.transport.Server.get_url."""
474
return self._get_remote_url(self._home_dir)
476
def get_bogus_url(self):
477
"""See bzrlib.transport.Server.get_bogus_url."""
478
# this is chosen to try to prevent trouble with proxies, weird dns,
480
return 'http://127.0.0.1:1/'
495
def _range_header(ranges, tail_amount):
496
"""Turn a list of bytes ranges into a HTTP Range header value.
498
:param ranges: A list of _CoalescedOffset
499
:param tail_amount: The amount to get from the end of the file.
501
:return: HTTP range header string.
503
At least a non-empty ranges *or* a tail_amount must be
507
for offset in ranges:
508
strings.append('%d-%d' % (offset.start,
509
offset.start + offset.length - 1))
512
strings.append('-%d' % tail_amount)
514
return ','.join(strings)
516
def send_http_smart_request(self, bytes):
518
code, body_filelike = self._post(bytes)
520
raise InvalidHttpResponse(
521
self._remote_path('.bzr/smart'),
522
'Expected 200 response code, got %r' % (code,))
523
except errors.InvalidHttpResponse, e:
524
raise errors.SmartProtocolError(str(e))
527
def should_probe(self):
530
def remote_path_from_transport(self, transport):
    """Return the location of ``transport`` relative to ``self.base``.

    :param transport: An object exposing a ``base`` URL string.
    :return: The result of ``urlutils.relative_url(self.base, base)``
        passed through ``urllib.unquote``, where ``base`` is
        ``transport.base`` without a leading 'bzr+'.
    """
    # Strip the optional 'bzr+' prefix from transport so it will have the
    # same scheme as self.
    transport_base = transport.base
    if transport_base.startswith('bzr+'):
        transport_base = transport_base[4:]
    rel_url = urlutils.relative_url(self.base, transport_base)
    return urllib.unquote(rel_url)
540
class SmartClientHTTPMediumRequest(medium.SmartClientMediumRequest):
541
"""A SmartClientMediumRequest that works with an HTTP medium."""
543
def __init__(self, client_medium):
544
medium.SmartClientMediumRequest.__init__(self, client_medium)
547
def _accept_bytes(self, bytes):
548
self._buffer += bytes
550
def _finished_writing(self):
551
data = self._medium.send_http_smart_request(self._buffer)
552
self._response_body = data
554
def _read_bytes(self, count):
555
"""See SmartClientMediumRequest._read_bytes."""
556
return self._response_body.read(count)
558
def _read_line(self):
559
line, excess = medium._get_line(self._response_body.read)
561
raise AssertionError(
562
'_get_line returned excess bytes, but this mediumrequest '
563
'cannot handle excess. (%r)' % (excess,))
566
def _finished_reading(self):
567
"""See SmartClientMediumRequest._finished_reading."""