bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
3059.2.2
by Vincent Ladeuil
Read http responses on demand without buffering the whole body |
1 |
# Copyright (C) 2006, 2007 Canonical Ltd
|
|
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
2 |
#
|
3 |
# This program is free software; you can redistribute it and/or modify
|
|
4 |
# it under the terms of the GNU General Public License as published by
|
|
5 |
# the Free Software Foundation; either version 2 of the License, or
|
|
6 |
# (at your option) any later version.
|
|
7 |
#
|
|
8 |
# This program is distributed in the hope that it will be useful,
|
|
9 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11 |
# GNU General Public License for more details.
|
|
12 |
#
|
|
13 |
# You should have received a copy of the GNU General Public License
|
|
14 |
# along with this program; if not, write to the Free Software
|
|
15 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
16 |
||
17 |
"""Handlers for HTTP Responses.
|
|
18 |
||
19 |
The purpose of these classes is to provide a uniform interface for clients
|
|
20 |
to standard HTTP responses, single range responses and multipart range
|
|
21 |
responses.
|
|
22 |
"""
|
|
23 |
||
24 |
||
|
3059.2.2
by Vincent Ladeuil
Read http responses on demand without buffering the whole body |
25 |
import httplib |
26 |
||
27 |
from bzrlib import ( |
|
28 |
errors, |
|
29 |
trace, |
|
30 |
)
|
|
31 |
||
32 |
||
|
3059.2.18
by Vincent Ladeuil
Take spiv review comments into account. |
33 |
# A RangeFile expects the following grammar (simplified to outline the
|
|
3059.2.2
by Vincent Ladeuil
Read http responses on demand without buffering the whole body |
34 |
# assumptions we rely upon).
|
35 |
||
36 |
# file: whole_file
|
|
37 |
# | single_range
|
|
38 |
# | multiple_range
|
|
39 |
||
40 |
# whole_file: [content_length_header] data
|
|
41 |
||
42 |
# single_range: content_range_header data
|
|
43 |
||
44 |
# multiple_range: boundary_header boundary (content_range_header data boundary)+
|
|
|
1750.1.2
by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib. |
45 |
|
46 |
class RangeFile(object):
    """File-like object that allows access to partially available data.

    All accesses should happen sequentially since the acquisition occurs during
    an http response reception (as sockets can't be seeked, we simulate the
    seek by just reading and discarding the data).

    The access pattern is defined by a set of ranges discovered as reading
    progresses. Only one range is available at a given time, so all accesses
    should happen with monotonically increasing offsets.

    A range size that is not strictly positive (the default is -1) means the
    size is unknown: reads are then delegated to the underlying file without
    any limit checking.
    """

    # in _checked_read() below, we may have to discard several MB in the worst
    # case. To avoid buffering that much, we read and discard by chunks
    # instead. The underlying file is either a socket or a StringIO, so reading
    # 8k chunks should be fine.
    _discarded_buf_size = 8192

    def __init__(self, path, infile):
        """Constructor.

        :param path: File url, for error reports.
        :param infile: File-like socket set at body start.
        """
        self._path = path
        self._file = infile
        self._boundary = None
        # When using multi parts response, this will be set with the headers
        # associated with the range currently read.
        self._headers = None
        # Default to the whole file of unspecified size
        self.set_range(0, -1)

    def set_range(self, start, size):
        """Change the range mapping.

        :param start: Offset of the first byte of the range.
        :param size: Number of bytes in the range; a value that is not
            strictly positive means the size is unknown.
        """
        self._start = start
        self._size = size
        # Set the new _pos since that's what we want to expose
        self._pos = self._start

    def set_boundary(self, boundary):
        """Define the boundary used in a multi parts message.

        The file should be at the beginning of the body, the first range
        definition is read and taken into account.

        :param boundary: The boundary string announced in the response
            headers.
        :raises InvalidHttpResponse: If no boundary is provided or if the
            body does not start with the expected boundary line and range
            headers.
        """
        if boundary is None:
            # Without this check read_boundary() would fail with a TypeError
            # while building the expected boundary line.
            raise errors.InvalidHttpResponse(
                self._path,
                'Missing boundary parameter in a multi-part response')
        self._boundary = boundary
        # Decode the headers and setup the first range
        self.read_boundary()
        self.read_range_definition()

    def read_boundary(self):
        """Read the boundary headers defining a new range.

        :raises InvalidHttpResponse: If the first non-empty line read is not
            the expected boundary line.
        """
        boundary_line = '\r\n'
        while boundary_line == '\r\n':
            # RFC2616 19.2 Additional CRLFs may precede the first boundary
            # string entity.
            # To be on the safe side we allow it before any boundary line
            boundary_line = self._file.readline()
        if boundary_line != '--' + self._boundary + '\r\n':
            raise errors.InvalidHttpResponse(
                self._path,
                "Expected a boundary (%s) line, got '%s'" % (self._boundary,
                                                             boundary_line))

    def read_range_definition(self):
        """Read a new range definition in a multi parts message.

        Parse the headers including the empty line following them so that we
        are ready to read the data itself.

        :raises InvalidHttpResponse: If the part has no Content-Range header.
        """
        self._headers = httplib.HTTPMessage(self._file, seekable=0)
        # Extract the range definition
        content_range = self._headers.getheader('content-range', None)
        if content_range is None:
            raise errors.InvalidHttpResponse(
                self._path,
                'Content-Range header missing in a multi-part response')
        self.set_range_from_header(content_range)

    def set_range_from_header(self, content_range):
        """Helper to set the new range from its description in the headers.

        :param content_range: Value of a Content-Range header, of the form
            'bytes <start>-<end>/<total>'.
        :raises InvalidHttpRange: If the value can't be parsed, uses a unit
            other than 'bytes' or describes an empty or negative range.
        """
        try:
            rtype, values = content_range.split()
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Malformed header')
        if rtype != 'bytes':
            raise errors.InvalidHttpRange(self._path, content_range,
                                          "Unsupported range type '%s'" % rtype)
        try:
            # We don't need total, but note that it may be either the file size
            # or '*' if the server can't or doesn't want to return the file
            # size.
            start_end, total = values.split('/')
            start, end = start_end.split('-')
            start = int(start)
            end = int(end)
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range values')
        # Both ends are inclusive
        size = end - start + 1
        if size <= 0:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range, size <= 0')
        self.set_range(start, size)

    def _checked_read(self, size):
        """Read the file checking for short reads.

        The data read is discarded along the way.

        :param size: Number of bytes to consume from the underlying file.
        :raises ShortReadvError: If the underlying file is exhausted before
            size bytes have been read.
        """
        pos = self._pos
        remaining = size
        while remaining > 0:
            data = self._file.read(min(remaining, self._discarded_buf_size))
            remaining -= len(data)
            if not data:
                raise errors.ShortReadvError(self._path, pos, size,
                                             size - remaining)
        self._pos += size

    def _seek_to_next_range(self):
        """Position the file at the start of the next range.

        :raises InvalidRange: If the response is not a multi parts one, since
            there is then no other range to go to.
        """
        # We will cross range boundaries
        if self._boundary is None:
            # If we don't have a boundary, we can't find another range
            raise errors.InvalidRange(self._path, self._pos,
                                      "Range (%s, %s) exhausted"
                                      % (self._start, self._size))
        self.read_boundary()
        self.read_range_definition()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported. We rely on the underlying http
        client to clean the socket if we leave bytes unread. This may occur for
        the final boundary line of a multipart response or for any range
        request not entirely consumed by the client (due to offset coalescing)

        :param size: Number of bytes to read, a negative value means up to
            the end of the current range (or until EOF if the range size is
            unknown).
        :return: The data read, which may be shorter than requested.
        :raises InvalidRange: If the read would cross the end of the current
            range or if there is no further range to read from.
        """
        if self._size > 0 and self._pos == self._start + self._size:
            # The current range is exhausted
            if size == 0:
                return ''
            self._seek_to_next_range()
        elif self._pos < self._start:
            raise errors.InvalidRange(
                self._path, self._pos,
                "Can't read %s bytes before range (%s, %s)"
                % (size, self._start, self._size))

        if self._size > 0:
            # Computed after the potential switch to the next range above
            range_end = self._start + self._size
            if size > 0 and self._pos + size > range_end:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "Can't read %s bytes across range (%s, %s)"
                    % (size, self._start, self._size))
            # Don't read past the range definition
            limited = range_end - self._pos
            if size >= 0:
                limited = min(limited, size)
            data = self._file.read(limited)
        else:
            # Size of file unknown, the user may have specified a size or not,
            # we delegate that to the filesocket object (-1 means read until
            # EOF)
            data = self._file.read(size)
        # Update _pos respecting the data effectively read
        self._pos += len(data)
        return data

    def seek(self, offset, whence=0):
        """Simulate a forward seek by reading and discarding data.

        :param offset: Offset, interpreted according to whence.
        :param whence: 0 for an absolute offset, 1 for an offset relative to
            the current position, 2 for an offset relative to the end of the
            current range (which requires its size to be known).
        :raises InvalidRange: When seeking backwards, when seeking from the
            end of a range of unknown size or when no further range is
            available.
        :raises ValueError: If whence is not 0, 1 or 2.
        :raises ShortReadvError: If the underlying file is exhausted before
            the requested position is reached.
        """
        start_pos = self._pos
        if whence == 0:
            final_pos = offset
        elif whence == 1:
            final_pos = start_pos + offset
        elif whence == 2:
            if self._size > 0:
                final_pos = self._start + self._size + offset  # offset < 0
            else:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "RangeFile: can't seek from end while size is unknown")
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        if final_pos < self._pos:
            # Can't seek backwards
            raise errors.InvalidRange(
                self._path, self._pos,
                'RangeFile: trying to seek backwards to %s' % final_pos)

        if self._size > 0:
            cur_limit = self._start + self._size
            while final_pos > cur_limit:
                # We will cross range boundaries
                remain = cur_limit - self._pos
                if remain > 0:
                    # Finish reading the current range
                    self._checked_read(remain)
                self._seek_to_next_range()
                cur_limit = self._start + self._size

        size = final_pos - self._pos
        if size > 0:  # size can be < 0 if we crossed a range boundary
            # We don't need the data, just read it and throw it away
            self._checked_read(size)

    def tell(self):
        """Return the current position, expressed as an offset in the file."""
        return self._pos
|
259 |
||
|
1786.1.5
by John Arbash Meinel
Move the common Multipart stuff into plain http, and wrap pycurl response so that it matches the urllib response object. |
260 |
|
|
3059.2.2
by Vincent Ladeuil
Read http responses on demand without buffering the whole body |
261 |
def handle_response(url, code, msg, data):
    """Interpret the code & headers and wrap the provided data in a RangeFile.

    This is a factory method which returns an appropriate RangeFile based on
    the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param msg: An HTTPMessage containing the headers for the response
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises InvalidHttpResponse: If the response code is neither 200 nor 206,
        or if a header needed to interpret the body is missing or malformed.
    """
    rfile = RangeFile(url, data)
    if code == 200:
        # A whole file
        content_length = msg.getheader('content-length', None)
        if content_length is None:
            # Unknown size
            size = -1
        else:
            try:
                size = int(content_length)
            except ValueError:
                # Report a garbled header like any other invalid response
                # instead of leaking a bare ValueError to the caller.
                raise errors.InvalidHttpResponse(
                    url,
                    "Invalid Content-Length header '%s'" % content_length)
        rfile.set_range(0, size)
    elif code == 206:
        if msg.getheader('content-type', None) is None:
            # When there is no content-type header we treat the response as
            # being of type 'application/octet-stream' as per RFC2616 section
            # 7.2.1.
            # Therefore it is obviously not multipart
            is_multipart = False
        else:
            is_multipart = (msg.getmaintype() == 'multipart'
                            and msg.getsubtype() == 'byteranges')

        if is_multipart:
            # Full fledged multipart response
            boundary = msg.getparam('boundary')
            if boundary is None:
                # The parts can't be delimited without a boundary
                raise errors.InvalidHttpResponse(
                    url,
                    'Missing boundary parameter in a multi-part response')
            rfile.set_boundary(boundary)
        else:
            # A response to a range request, but not multipart
            content_range = msg.getheader('content-range', None)
            if content_range is None:
                raise errors.InvalidHttpResponse(url,
                    'Missing the Content-Range header in a 206 range response')
            rfile.set_range_from_header(content_range)
    else:
        raise errors.InvalidHttpResponse(url,
                                         'Unknown response code %s' % code)

    return rfile
|
|
1786.1.21
by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers. |
312 |