/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
3059.2.2 by Vincent Ladeuil
Read http responses on demand without buffering the whole body
1
# Copyright (C) 2006, 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Handlers for HTTP Responses.

The purpose of these classes is to provide a uniform interface for clients
to standard HTTP responses, single range responses and multipart range
responses.
"""
23
24
3059.2.2 by Vincent Ladeuil
Read http responses on demand without buffering the whole body
25
import httplib
26
27
from bzrlib import (
28
    errors,
29
    trace,
30
    )
31
32
3059.2.18 by Vincent Ladeuil
Take spiv review comments into account.
33
# A RangeFile expects the following grammar (simplified to outline the
# assumptions we rely upon).

# file: whole_file
#     | single_range
#     | multiple_range

# whole_file: [content_length_header] data

# single_range: content_range_header data

# multiple_range: boundary_header boundary (content_range_header data boundary)+
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
45
46
class RangeFile(object):
    """File-like object that allows access to partially available data.

    All accesses should happen sequentially since the acquisition occurs during
    an http response reception (as sockets can't be seeked, we simulate the
    seek by just reading and discarding the data).

    The access pattern is defined by a set of ranges discovered as reading
    progresses. Only one range is available at a given time, so all accesses
    should happen with monotonically increasing offsets.
    """

    # In _checked_read() below, we may have to discard several MB in the worst
    # case. To avoid buffering that much, we read and discard by chunks
    # instead. The underlying file is either a socket or a StringIO, so reading
    # 8k chunks should be fine.
    _discarded_buf_size = 8192

    def __init__(self, path, infile):
        """Constructor.

        :param path: File url, for error reports.
        :param infile: File-like socket set at body start.
        """
        self._path = path
        self._file = infile
        self._boundary = None
        # When using multi parts response, this will be set with the headers
        # associated with the range currently read.
        self._headers = None
        # Default to the whole file of unspecified size
        self.set_range(0, -1)

    def set_range(self, start, size):
        """Change the range mapping.

        :param start: Offset of the first byte of the range.
        :param size: Number of bytes in the range; a value <= 0 is treated
            by read() and seek() as 'size unknown'.
        """
        self._start = start
        self._size = size
        # Set the new _pos since that's what we want to expose
        self._pos = self._start

    def set_boundary(self, boundary):
        """Define the boundary used in a multi parts message.

        The file should be at the beginning of the body, the first range
        definition is read and taken into account.

        :param boundary: The boundary string, without the leading '--'.
        """
        self._boundary = boundary
        # Decode the headers and setup the first range
        self.read_boundary()
        self.read_range_definition()

    def read_boundary(self):
        """Read the boundary headers defining a new range.

        :raises errors.InvalidHttpResponse: If the first non-blank line read
            is not the expected boundary line.
        """
        boundary_line = '\r\n'
        while boundary_line == '\r\n':
            # RFC2616 19.2 Additional CRLFs may precede the first boundary
            # string entity.
            # To be on the safe side we allow it before any boundary line
            boundary_line = self._file.readline()
        if boundary_line != '--' + self._boundary + '\r\n':
            raise errors.InvalidHttpResponse(
                self._path,
                "Expected a boundary (%s) line, got '%s'" % (self._boundary,
                                                             boundary_line))

    def read_range_definition(self):
        """Read a new range definition in a multi parts message.

        Parse the headers including the empty line following them so that we
        are ready to read the data itself.

        :raises errors.InvalidHttpResponse: If the part carries no
            Content-Range header.
        """
        self._headers = httplib.HTTPMessage(self._file, seekable=0)
        # Extract the range definition
        content_range = self._headers.getheader('content-range', None)
        if content_range is None:
            raise errors.InvalidHttpResponse(
                self._path,
                'Content-Range header missing in a multi-part response')
        self.set_range_from_header(content_range)

    def set_range_from_header(self, content_range):
        """Helper to set the new range from its description in the headers.

        :param content_range: The Content-Range header value, in the form
            'bytes <start>-<end>/<total>'.
        :raises errors.InvalidHttpRange: If the value is malformed, uses a
            range type other than 'bytes' or describes an empty range.
        """
        try:
            rtype, values = content_range.split()
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Malformed header')
        if rtype != 'bytes':
            raise errors.InvalidHttpRange(self._path, content_range,
                                          "Unsupported range type '%s'" % rtype)
        try:
            # We don't need total, but note that it may be either the file size
            # or '*' if the server can't or doesn't want to return the file
            # size.
            start_end, total = values.split('/')
            start, end = start_end.split('-')
            start = int(start)
            end = int(end)
        except ValueError:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range values')
        # Both bounds are inclusive
        size = end - start + 1
        if size <= 0:
            raise errors.InvalidHttpRange(self._path, content_range,
                                          'Invalid range, size <= 0')
        self.set_range(start, size)

    def _checked_read(self, size):
        """Read the file checking for short reads.

        The data read is discarded along the way. The position is advanced
        by size only once all the bytes have been read.

        :param size: Number of bytes to read and discard.
        :raises errors.ShortReadvError: If the file is exhausted before size
            bytes have been read.
        """
        pos = self._pos
        remaining = size
        while remaining > 0:
            data = self._file.read(min(remaining, self._discarded_buf_size))
            remaining -= len(data)
            if not data:
                raise errors.ShortReadvError(self._path, pos, size,
                                             size - remaining)
        self._pos += size

    def _seek_to_next_range(self):
        """Position the file at the data of the next range.

        :raises errors.InvalidRange: If no boundary is defined, in which
            case there is no further range to find.
        """
        # We will cross range boundaries
        if self._boundary is None:
            # If we don't have a boundary, we can't find another range
            raise errors.InvalidRange(self._path, self._pos,
                                      "Range (%s, %s) exhausted"
                                      % (self._start, self._size))
        self.read_boundary()
        self.read_range_definition()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported. We rely on the underlying http
        client to clean the socket if we leave bytes unread. This may occur for
        the final boundary line of a multipart response or for any range
        request not entirely consumed by the client (due to offset coalescing)

        :param size: The number of bytes to read. A negative value means
            'up to the end of the current range' when the range size is
            known, and is passed as is to the underlying file otherwise.
        :return: The data read.
        :raises errors.InvalidRange: If the read would start before or extend
            past the current range.
        """
        if (self._size > 0
            and self._pos == self._start + self._size):
            # The current range is exhausted
            if size == 0:
                return ''
            else:
                self._seek_to_next_range()
        elif self._pos < self._start:
            raise errors.InvalidRange(
                self._path, self._pos,
                "Can't read %s bytes before range (%s, %s)"
                % (size, self._start, self._size))

        if self._size > 0:
            # Computed here since _seek_to_next_range() may just have
            # installed a new range.
            range_end = self._start + self._size
            if size > 0 and self._pos + size > range_end:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "Can't read %s bytes across range (%s, %s)"
                    % (size, self._start, self._size))
            # Don't read past the range definition
            limited = range_end - self._pos
            if size >= 0:
                limited = min(limited, size)
            data = self._file.read(limited)
        else:
            # Size of file unknown, the user may have specified a size or not,
            # we delegate that to the filesocket object (-1 means read until
            # EOF)
            data = self._file.read(size)
        # Update _pos respecting the data effectively read
        self._pos += len(data)
        return data

    def seek(self, offset, whence=0):
        """Move forward to a new position, discarding the data skipped.

        :param offset: The offset, interpreted according to whence.
        :param whence: 0 for an absolute position, 1 for a position relative
            to the current one, 2 for a position relative to the end of the
            current range (which requires the range size to be known).
        :raises ValueError: If whence is not 0, 1 or 2.
        :raises errors.InvalidRange: If the target lies before the current
            position, or if whence is 2 while the size is unknown.
        """
        start_pos = self._pos
        if whence == 0:
            final_pos = offset
        elif whence == 1:
            final_pos = start_pos + offset
        elif whence == 2:
            if self._size > 0:
                final_pos = self._start + self._size + offset # offset < 0
            else:
                raise errors.InvalidRange(
                    self._path, self._pos,
                    "RangeFile: can't seek from end while size is unknown")
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        if final_pos < self._pos:
            # Can't seek backwards
            raise errors.InvalidRange(
                self._path, self._pos,
                'RangeFile: trying to seek backwards to %s' % final_pos)

        if self._size > 0:
            cur_limit = self._start + self._size
            while final_pos > cur_limit:
                # We will cross range boundaries
                remain = cur_limit - self._pos
                if remain > 0:
                    # Finish reading the current range
                    self._checked_read(remain)
                self._seek_to_next_range()
                cur_limit = self._start + self._size

        size = final_pos - self._pos
        if size > 0: # size can be < 0 if we crossed a range boundary
            # We don't need the data, just read it and throw it away
            self._checked_read(size)

    def tell(self):
        """Return the current position."""
        return self._pos
259
1786.1.5 by John Arbash Meinel
Move the common Multipart stuff into plain http, and wrap pycurl response so that it matches the urllib response object.
260
3059.2.2 by Vincent Ladeuil
Read http responses on demand without buffering the whole body
261
def handle_response(url, code, msg, data):
    """Interpret the code & headers and wrap the provided data in a RangeFile.

    This is a factory method which returns an appropriate RangeFile based on
    the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param msg: An HTTPMessage containing the headers for the response
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises errors.InvalidHttpResponse: If the code is neither 200 nor 206,
        if a single range 206 response has no Content-Range header, or if a
        multipart response does not define its boundary.
    """
    rfile = RangeFile(url, data)
    if code == 200:
        # A whole file
        size = msg.getheader('content-length', None)
        if size is None:
            size = -1
        else:
            size = int(size)
        rfile.set_range(0, size)
    elif code == 206:
        content_type = msg.getheader('content-type', None)
        if content_type is None:
            # When there is no content-type header we treat the response as
            # being of type 'application/octet-stream' as per RFC2616 section
            # 7.2.1.
            # Therefore it is obviously not multipart
            is_multipart = False
        else:
            is_multipart = (msg.getmaintype() == 'multipart'
                            and msg.getsubtype() == 'byteranges')

        if is_multipart:
            # Full fledged multipart response
            boundary = msg.getparam('boundary')
            if boundary is None:
                # Without a boundary the parts can't be delimited; report it
                # as an invalid response rather than failing later on when
                # building the expected boundary line.
                raise errors.InvalidHttpResponse(url,
                    'Missing the boundary parameter in a multipart response')
            rfile.set_boundary(boundary)
        else:
            # A response to a range request, but not multipart
            content_range = msg.getheader('content-range', None)
            if content_range is None:
                raise errors.InvalidHttpResponse(url,
                    'Missing the Content-Range header in a 206 range response')
            rfile.set_range_from_header(content_range)
    else:
        raise errors.InvalidHttpResponse(url,
                                         'Unknown response code %s' % code)

    return rfile
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
312