26
from cStringIO import StringIO
24
from __future__ import absolute_import
28
import http.client as http_client
29
except ImportError: # python < 3
30
import httplib as http_client
32
import email.utils as email_utils
33
except ImportError: # python < 3
34
import rfc822 as email_utils
40
from ...sixish import (
46
class ResponseFile(object):
47
"""A wrapper around the http socket containing the result of a GET request.
49
Only read() and seek() (forward) are supported.
53
def __init__(self, path, infile):
56
:param path: File url, for error reports.
58
:param infile: File-like socket set at body start.
67
Dummy implementation for consistency with the 'file' API.
73
def __exit__(self, exc_type, exc_val, exc_tb):
74
return False # propogate exceptions.
76
def read(self, size=None):
77
"""Read size bytes from the current position in the file.
79
:param size: The number of bytes to read. Leave unspecified or pass
82
if size is None and not PY3:
84
data = self._file.read(size)
85
self._pos += len(data)
89
data = self._file.readline()
90
self._pos += len(data)
95
line = self.readline()
103
def seek(self, offset, whence=os.SEEK_SET):
104
if whence == os.SEEK_SET:
105
if offset < self._pos:
106
raise AssertionError(
107
"Can't seek backwards, pos: %s, offset: %s"
108
% (self._pos, offset))
109
to_discard = offset - self._pos
110
elif whence == os.SEEK_CUR:
113
raise AssertionError("Can't seek backwards")
115
# Just discard the unwanted bytes
116
self.read(to_discard)
36
118
# A RangeFile expects the following grammar (simplified to outline the
37
119
# assumptions we rely upon).
41
122
# | multiple_range
43
# whole_file: [content_length_header] data
45
124
# single_range: content_range_header data
47
126
# multiple_range: boundary_header boundary (content_range_header data boundary)+
49
class RangeFile(object):
129
class RangeFile(ResponseFile):
50
130
"""File-like object that allow access to partial available data.
52
132
All accesses should happen sequentially since the acquisition occurs during
61
141
# in _checked_read() below, we may have to discard several MB in the worst
62
142
# case. To avoid buffering that much, we read and discard by chunks
63
# instead. The underlying file is either a socket or a StringIO, so reading
143
# instead. The underlying file is either a socket or a BytesIO, so reading
64
144
# 8k chunks should be fine.
65
145
_discarded_buf_size = 8192
73
153
:param path: File url, for error reports.
74
155
:param infile: File-like socket set at body start.
157
super(RangeFile, self).__init__(path, infile)
78
158
self._boundary = None
79
159
# When using multi parts response, this will be set with the headers
80
160
# associated with the range currently read.
95
175
The file should be at the beginning of the body, the first range
96
176
definition is read and taken into account.
178
if not isinstance(boundary, bytes):
179
raise TypeError(boundary)
98
180
self._boundary = boundary
99
181
# Decode the headers and setup the first range
100
182
self.read_boundary()
103
185
def read_boundary(self):
104
186
"""Read the boundary headers defining a new range"""
105
boundary_line = '\r\n'
106
while boundary_line == '\r\n':
187
boundary_line = b'\r\n'
188
while boundary_line == b'\r\n':
107
189
# RFC2616 19.2 Additional CRLFs may precede the first boundary
109
191
# To be on the safe side we allow it before any boundary line
110
192
boundary_line = self._file.readline()
112
if boundary_line != '--' + self._boundary + '\r\n':
113
# rfc822.unquote() incorrectly unquotes strings enclosed in <>
194
if boundary_line == b'':
195
# A timeout in the proxy server caused the response to end early.
196
# See launchpad bug 198646.
197
raise errors.HttpBoundaryMissing(
201
if boundary_line != b'--' + self._boundary + b'\r\n':
202
# email_utils.unquote() incorrectly unquotes strings enclosed in <>
114
203
# IIS 6 and 7 incorrectly wrap boundary strings in <>
115
204
# together they make a beautiful bug, which we will be gracious
117
206
if (self._unquote_boundary(boundary_line) !=
118
'--' + self._boundary + '\r\n'):
207
b'--' + self._boundary + b'\r\n'):
119
208
raise errors.InvalidHttpResponse(
121
210
"Expected a boundary (%s) line, got '%s'"
122
211
% (self._boundary, boundary_line))
124
213
def _unquote_boundary(self, b):
125
return b[:2] + rfc822.unquote(b[2:-2]) + b[-2:]
214
return b[:2] + email_utils.unquote(b[2:-2].decode('ascii')).encode('ascii') + b[-2:]
127
216
def read_range_definition(self):
128
217
"""Read a new range definition in a multi parts message.
130
219
Parse the headers including the empty line following them so that we
131
220
are ready to read the data itself.
133
self._headers = httplib.HTTPMessage(self._file, seekable=0)
223
self._headers = http_client.parse_headers(self._file)
225
self._headers = http_client.HTTPMessage(self._file, seekable=0)
134
226
# Extract the range definition
135
content_range = self._headers.getheader('content-range', None)
227
content_range = self._headers.get('content-range', None)
136
228
if content_range is None:
137
229
raise errors.InvalidHttpResponse(
221
313
% (size, self._start, self._size))
223
315
# read data from file
226
318
if self._size > 0:
227
319
# Don't read past the range definition
228
320
limited = self._start + self._size - self._pos
230
322
limited = min(limited, size)
231
osutils.pumpfile(self._file, buffer, limited, self._max_read_size)
232
data = buffer.getvalue()
323
osutils.pumpfile(self._file, buf, limited, self._max_read_size)
324
data = buf.getvalue()
234
326
# Update _pos respecting the data effectively read
235
327
self._pos += len(data)
269
361
cur_limit = self._start + self._size
271
363
size = final_pos - self._pos
272
if size > 0: # size can be < 0 if we crossed a range boundary
364
if size > 0: # size can be < 0 if we crossed a range boundary
273
365
# We don't need the data, just read it and throw it away
274
366
self._checked_read(size)
291
383
:return: A file-like object that can seek()+read() the
292
384
ranges indicated by the headers.
294
rfile = RangeFile(url, data)
297
size = msg.getheader('content-length', None)
302
rfile.set_range(0, size)
388
rfile = ResponseFile(url, data)
303
389
elif code == 206:
304
content_type = msg.getheader('content-type', None)
390
rfile = RangeFile(url, data)
391
content_type = msg.get('content-type', None)
305
392
if content_type is None:
306
393
# When there is no content-type header we treat the response as
307
394
# being of type 'application/octet-stream' as per RFC2616 section
310
397
content_type = 'application/octet-stream'
311
398
is_multipart = False
313
is_multipart = (msg.getmaintype() == 'multipart'
314
and msg.getsubtype() == 'byteranges')
401
is_multipart = (msg.get_content_maintype() == 'multipart'
402
and msg.get_content_subtype() == 'byteranges')
404
is_multipart = (msg.getmaintype() == 'multipart'
405
and msg.getsubtype() == 'byteranges')
317
408
# Full fledged multipart response
318
rfile.set_boundary(msg.getparam('boundary'))
410
boundary = msg.get_param('boundary')
412
boundary = msg.getparam('boundary')
413
rfile.set_boundary(boundary.encode('ascii'))
320
415
# A response to a range request, but not multipart
321
content_range = msg.getheader('content-range', None)
416
content_range = msg.get('content-range', None)
322
417
if content_range is None:
323
418
raise errors.InvalidHttpResponse(url,
324
'Missing the Content-Range header in a 206 range response')
419
'Missing the Content-Range header in a 206 range response')
325
420
rfile.set_range_from_header(content_range)
327
422
raise errors.InvalidHttpResponse(url,
328
423
'Unknown response code %s' % code)