24
from __future__ import absolute_import
29
import http.client as http_client
30
except ImportError: # python < 3
31
import httplib as http_client
33
import email.utils as email_utils
34
except ImportError: # python < 3
35
import rfc822 as email_utils
26
from cStringIO import StringIO
41
from ...sixish import (
47
class ResponseFile(object):
48
"""A wrapper around the http socket containing the result of a GET request.
50
Only read() and seek() (forward) are supported.
54
def __init__(self, path, infile):
57
:param path: File url, for error reports.
59
:param infile: File-like socket set at body start.
68
Dummy implementation for consistency with the 'file' API.
74
def __exit__(self, exc_type, exc_val, exc_tb):
75
return False # propogate exceptions.
77
def read(self, size=None):
78
"""Read size bytes from the current position in the file.
80
:param size: The number of bytes to read. Leave unspecified or pass
83
if size is None and not PY3:
85
data = self._file.read(size)
86
self._pos += len(data)
90
data = self._file.readline()
91
self._pos += len(data)
96
line = self.readline()
104
def seek(self, offset, whence=os.SEEK_SET):
105
if whence == os.SEEK_SET:
106
if offset < self._pos:
107
raise AssertionError(
108
"Can't seek backwards, pos: %s, offset: %s"
109
% (self._pos, offset))
110
to_discard = offset - self._pos
111
elif whence == os.SEEK_CUR:
114
raise AssertionError("Can't seek backwards")
116
# Just discard the unwanted bytes
117
self.read(to_discard)
119
36
# A RangeFile expects the following grammar (simplified to outline the
120
37
# assumptions we rely upon).
123
41
# | multiple_range
43
# whole_file: [content_length_header] data
125
45
# single_range: content_range_header data
127
47
# multiple_range: boundary_header boundary (content_range_header data boundary)+
130
class RangeFile(ResponseFile):
49
class RangeFile(object):
131
50
"""File-like object that allows access to partially available data.
133
52
All accesses should happen sequentially since the acquisition occurs during
142
61
# in _checked_read() below, we may have to discard several MB in the worst
143
62
# case. To avoid buffering that much, we read and discard by chunks
144
# instead. The underlying file is either a socket or a BytesIO, so reading
63
# instead. The underlying file is either a socket or a StringIO, so reading
145
64
# 8k chunks should be fine.
146
65
_discarded_buf_size = 8192
154
73
:param path: File url, for error reports.
156
74
:param infile: File-like socket set at body start.
158
super(RangeFile, self).__init__(path, infile)
159
78
self._boundary = None
160
79
# When using a multipart response, this will be set to the headers
161
80
# associated with the range currently read.
176
95
The file should be at the beginning of the body, the first range
177
96
definition is read and taken into account.
179
if not isinstance(boundary, bytes):
180
raise TypeError(boundary)
181
98
self._boundary = boundary
182
99
# Decode the headers and setup the first range
183
100
self.read_boundary()
186
103
def read_boundary(self):
187
104
"""Read the boundary headers defining a new range"""
188
boundary_line = b'\r\n'
189
while boundary_line == b'\r\n':
105
boundary_line = '\r\n'
106
while boundary_line == '\r\n':
190
107
# RFC2616 19.2 Additional CRLFs may precede the first boundary
192
109
# To be on the safe side we allow it before any boundary line
193
110
boundary_line = self._file.readline()
195
if boundary_line == b'':
196
# A timeout in the proxy server caused the response to end early.
197
# See launchpad bug 198646.
198
raise errors.HttpBoundaryMissing(
202
if boundary_line != b'--' + self._boundary + b'\r\n':
203
# email_utils.unquote() incorrectly unquotes strings enclosed in <>
112
if boundary_line != '--' + self._boundary + '\r\n':
113
# rfc822.unquote() incorrectly unquotes strings enclosed in <>
204
114
# IIS 6 and 7 incorrectly wrap boundary strings in <>
205
115
# together they make a beautiful bug, which we will be gracious
207
117
if (self._unquote_boundary(boundary_line) !=
208
b'--' + self._boundary + b'\r\n'):
118
'--' + self._boundary + '\r\n'):
209
119
raise errors.InvalidHttpResponse(
211
121
"Expected a boundary (%s) line, got '%s'"
212
122
% (self._boundary, boundary_line))
214
124
def _unquote_boundary(self, b):
215
return b[:2] + email_utils.unquote(b[2:-2].decode('ascii')).encode('ascii') + b[-2:]
125
return b[:2] + rfc822.unquote(b[2:-2]) + b[-2:]
217
127
def read_range_definition(self):
218
128
"""Read a new range definition in a multipart message.
220
130
Parse the headers including the empty line following them so that we
221
131
are ready to read the data itself.
224
self._headers = http_client.parse_headers(self._file)
226
self._headers = http_client.HTTPMessage(self._file, seekable=0)
133
self._headers = httplib.HTTPMessage(self._file, seekable=0)
227
134
# Extract the range definition
228
content_range = self._headers.get('content-range', None)
135
content_range = self._headers.getheader('content-range', None)
229
136
if content_range is None:
230
137
raise errors.InvalidHttpResponse(
314
221
% (size, self._start, self._size))
316
223
# read data from file
319
226
if self._size > 0:
320
227
# Don't read past the range definition
321
228
limited = self._start + self._size - self._pos
323
230
limited = min(limited, size)
324
osutils.pumpfile(self._file, buf, limited, self._max_read_size)
325
data = buf.getvalue()
231
osutils.pumpfile(self._file, buffer, limited, self._max_read_size)
232
data = buffer.getvalue()
327
234
# Update _pos respecting the data effectively read
328
235
self._pos += len(data)
362
269
cur_limit = self._start + self._size
364
271
size = final_pos - self._pos
365
if size > 0: # size can be < 0 if we crossed a range boundary
272
if size > 0: # size can be < 0 if we crossed a range boundary
366
273
# We don't need the data, just read it and throw it away
367
274
self._checked_read(size)
373
def handle_response(url, code, getheader, data):
280
def handle_response(url, code, msg, data):
374
281
"""Interpret the code & headers and wrap the provided data in a RangeFile.
376
283
This is a factory method which returns an appropriate RangeFile based on
379
286
:param url: The url being processed. Mostly for error reporting
380
287
:param code: The integer HTTP response code
381
:param getheader: Function for retrieving header
288
:param msg: An HTTPMessage containing the headers for the response
382
289
:param data: A file-like object that can be read() to get the
384
291
:return: A file-like object that can seek()+read() the
385
292
ranges indicated by the headers.
294
rfile = RangeFile(url, data)
389
rfile = ResponseFile(url, data)
297
size = msg.getheader('content-length', None)
302
rfile.set_range(0, size)
390
303
elif code == 206:
391
rfile = RangeFile(url, data)
392
# When there is no content-type header we treat the response as
393
# being of type 'application/octet-stream' as per RFC2616 section
395
# Therefore it is obviously not multipart
396
content_type = getheader('content-type', 'application/octet-stream')
397
mimetype, options = cgi.parse_header(content_type)
398
if mimetype == 'multipart/byteranges':
399
rfile.set_boundary(options['boundary'].encode('ascii'))
304
content_type = msg.getheader('content-type', None)
305
if content_type is None:
306
# When there is no content-type header we treat the response as
307
# being of type 'application/octet-stream' as per RFC2616 section
309
# Therefore it is obviously not multipart
310
content_type = 'application/octet-stream'
313
is_multipart = (msg.getmaintype() == 'multipart'
314
and msg.getsubtype() == 'byteranges')
317
# Full fledged multipart response
318
rfile.set_boundary(msg.getparam('boundary'))
401
320
# A response to a range request, but not multipart
402
content_range = getheader('content-range', None)
321
content_range = msg.getheader('content-range', None)
403
322
if content_range is None:
404
raise errors.InvalidHttpResponse(
405
url, 'Missing the Content-Range header in a 206 range response')
323
raise errors.InvalidHttpResponse(url,
324
'Missing the Content-Range header in a 206 range response')
406
325
rfile.set_range_from_header(content_range)
408
327
raise errors.InvalidHttpResponse(url,
409
328
'Unknown response code %s' % code)