/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
1
# Copyright (C) 2006 Michael Ellerman
1786.1.33 by John Arbash Meinel
Cleanup pass #2
2
#           modified by John Arbash Meinel (Canonical Ltd)
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
3
#
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
18
"""Handlers for HTTP Responses.
19
20
The purpose of these classes is to provide a uniform interface for clients
21
to standard HTTP responses, single range responses and multipart range
22
responses.
23
"""
24
25
26
from bisect import bisect
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
27
from cStringIO import StringIO
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
28
import re
29
1786.1.13 by John Arbash Meinel
Found a few bugs in error handling code, updated tests
30
from bzrlib import errors
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
31
from bzrlib.trace import mutter
32
33
34
class ResponseRange(object):
    """A range in a RangeFile-object.

    Records that the entity bytes ``ent_start``..``ent_end`` (inclusive)
    are stored starting at offset ``data_start`` of the buffered response
    body.

    Instances order by ``(ent_start, ent_end, data_start)``.  They can also
    be compared against a plain integer, in which case only ``ent_start``
    takes part; that is what allows a sorted list of ranges to be bisected
    with a file position.
    """

    __slots__ = ['_ent_start', '_ent_end', '_data_start']

    def __init__(self, ent_start, ent_end, data_start):
        """Create a range.

        :param ent_start: Start offset of the range in the entity
        :param ent_end: End offset of the range in the entity (inclusive)
        :param data_start: Offset of the first byte of the range in the
            buffered data
        """
        self._ent_start = ent_start
        self._ent_end = ent_end
        self._data_start = data_start

    def __cmp__(self, other):
        """Compare this to other.

        We need this both for sorting, and so that we can
        bisect the list of ranges.

        :param other: Another ResponseRange, or an integer offset
        :return: -1, 0 or 1
        """
        if isinstance(other, int):
            # Later on we bisect for a starting point
            # so we allow comparing against a single integer
            mine, theirs = self._ent_start, other
        else:
            mine = (self._ent_start, self._ent_end, self._data_start)
            theirs = (other._ent_start, other._ent_end, other._data_start)
        # Spelled without the cmp() builtin so the result does not depend
        # on that builtin being available.
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons, so that sort() and bisect() do not depend on the
    # interpreter falling back to __cmp__.  They all defer to __cmp__ to
    # keep a single definition of the ordering.
    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __str__(self):
        return "%s(%s-%s,%s)" % (self.__class__.__name__,
                                 self._ent_start, self._ent_end,
                                 self._data_start)
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
62
63
64
class RangeFile(object):
    """File-like object that allow access to partial available data.

    Specified by a set of ranges.

    The whole of ``input_file`` is read into memory on construction.
    Ranges are registered with _add_range() and must be sorted with
    _finish_ranges() before read() is used, because read() bisects the
    list of ranges.
    """

    def __init__(self, path, input_file):
        """Buffer input_file.

        :param path: Name of the resource, used for error reporting
        :param input_file: An object whose read() returns the raw data
        """
        self._path = path
        self._pos = 0
        # NOTE(review): this ends up holding the largest *inclusive* end
        # offset seen by _add_range(), not a byte count, so seek(0, 2)
        # lands on the last known byte rather than one past it -- confirm
        # that this is intended.
        self._len = 0
        self._ranges = []
        self._data = input_file.read()

    def _add_range(self, ent_start, ent_end, data_start):
        """Add an entity range.

        :param ent_start: Start offset of entity
        :param ent_end: End offset of entity (inclusive)
        :param data_start: Start offset of data in data stream.
        """
        self._ranges.append(ResponseRange(ent_start, ent_end, data_start))
        self._len = max(self._len, ent_end)

    def _finish_ranges(self):
        """Sort the registered ranges so that read() can bisect them."""
        self._ranges.sort()

    def read(self, size=-1):
        """Read size bytes from the current position in the file.

        Reading across ranges is not supported.

        :param size: Number of bytes to read.  A negative value or None
            reads up to the end of the range containing the current
            position.
        :return: The requested slice of the buffered data
        :raises errors.InvalidRange: If the current position is not inside
            a known range, or if the request runs past the end of that
            range.
        """
        # find the last range which has a start <= pos
        i = bisect(self._ranges, self._pos) - 1

        if i < 0 or self._pos > self._ranges[i]._ent_end:
            raise errors.InvalidRange(self._path, self._pos)

        r = self._ranges[i]

        if size is None or size < 0:
            # A negative size used to fall through and move the position
            # backwards; follow the usual file convention instead, limited
            # to the current range.
            size = r._ent_end - self._pos + 1
        elif (self._pos + size - 1) > r._ent_end:
            raise errors.InvalidRange(self._path, self._pos)

        start = r._data_start + (self._pos - r._ent_start)
        end = start + size
        self._pos += size
        return self._data[start:end]

    def seek(self, offset, whence=0):
        """Move the current position.

        :param offset: Offset to apply
        :param whence: 0 for absolute, 1 for relative to the current
            position, 2 for relative to the largest end offset registered
            so far
        :raises ValueError: If whence is not 0, 1 or 2
        """
        if whence == 0:
            self._pos = offset
        elif whence == 1:
            self._pos += offset
        elif whence == 2:
            self._pos = self._len + offset
        else:
            raise ValueError("Invalid value %s for whence." % whence)

        # Never leave the position before the start of the file
        if self._pos < 0:
            self._pos = 0

    def tell(self):
        """Return the current position."""
        return self._pos
130
1786.1.5 by John Arbash Meinel
Move the common Multipart stuff into plain http, and wrap pycurl response so that it matches the urllib response object.
131
1750.1.2 by Michael Ellerman
Add support for HTTP multipart ranges and hook it into http+urllib.
132
class HttpRangeResponse(RangeFile):
    """A single-range HTTP response."""

    # TODO: jam 20060706 Consider compiling these regexes on demand
    # <unit> <first>-<last>/<total>, where <total> may be '*' when the
    # server does not know the full length of the entity.
    _CONTENT_RANGE_RE = re.compile(
        r'\s*([^\s]+)\s+([0-9]+)-([0-9]+)/([0-9]+|\*)\s*$')

    def __init__(self, path, content_range, input_file):
        """Wrap the body of a 206 response that carries a single range.

        :param path: Name of the resource, used for error reporting
        :param content_range: The value of the Content-Range header
        :param input_file: An object whose read() returns the body
        """
        # mutter("parsing 206 non-multipart response for %s", path)
        RangeFile.__init__(self, path, input_file)
        start, end = self._parse_range(content_range, path)
        # The body holds nothing but the range, so its data starts at 0
        self._add_range(start, end, 0)
        self._finish_ranges()

    @staticmethod
    def _parse_range(range, path='<unknown>'):
        """Parse an http Content-range header and return start + end

        :param range: The value for Content-range
        :param path: Provide to give better error messages.
        :return: (start, end) A tuple of integers
        :raises errors.InvalidHttpRange: If the value is missing or
            malformed, uses a unit other than 'bytes', or ends before it
            starts.
        """
        match = None
        if range is not None:
            # Matching None would raise TypeError rather than the
            # documented error, so a missing value is treated as invalid.
            match = HttpRangeResponse._CONTENT_RANGE_RE.match(range)
        if not match:
            raise errors.InvalidHttpRange(path, range,
                                          "Invalid Content-range")

        # The total length (group 4) is validated by the regex but unused
        rtype, start, end = match.group(1, 2, 3)

        if rtype != 'bytes':
            raise errors.InvalidHttpRange(path, range,
                    "Unsupported range type '%s'" % (rtype,))

        try:
            start = int(start)
            end = int(end)
        except ValueError as e:
            raise errors.InvalidHttpRange(path, range, str(e))

        if end < start:
            # A last-byte-pos below the first-byte-pos is invalid, and
            # would register a range that can never be read.
            raise errors.InvalidHttpRange(path, range,
                    "Range end %d is before range start %d" % (end, start))

        return start, end
1786.1.16 by John Arbash Meinel
Refactor tests
172
173
174
class HttpMultipartRangeResponse(RangeFile):
    """A multi-range HTTP response."""

    # The media type is matched case-insensitively; the captured boundary
    # keeps its original case.
    _CONTENT_TYPE_RE = re.compile(
        r'^\s*multipart/byteranges\s*;\s*boundary\s*=\s*(.*?)\s*$',
        re.IGNORECASE)

    # Start with --<boundary>\r\n
    # and ignore all headers ending in \r\n
    # except for content-range:
    # and find the two trailing \r\n separators
    # indicating the start of the text
    # TODO: jam 20060706 This requires exact conformance
    #       to the spec, we probably could relax the requirement
    #       of \r\n, and use something more like (\r?\n)
    _BOUNDARY_PATT = (
        "^--%s(?:\r\n(?:(?:content-range:([^\r]+))|[^\r]+))+\r\n\r\n")

    def __init__(self, path, content_type, input_file):
        """Wrap the body of a multipart/byteranges 206 response.

        :param path: Name of the resource, used for error reporting
        :param content_type: The value of the Content-Type header,
            including the boundary parameter
        :param input_file: An object whose read() returns the body
        :raises errors.InvalidHttpContentType: If content_type is not a
            multipart/byteranges type with a boundary
        :raises errors.InvalidHttpResponse: If a part has no Content-Range
            header
        :raises errors.InvalidHttpRange: If a part's Content-Range cannot
            be parsed
        """
        # mutter("parsing 206 multipart response for %s", path)
        RangeFile.__init__(self, path, input_file)

        self.boundary_regex = self._parse_boundary(content_type, path)

        for match in self.boundary_regex.finditer(self._data):
            content_range = match.group(1)
            if content_range is None:
                # The part header matched without a content-range line;
                # passing None on would fail with a TypeError instead of
                # a transport error.
                raise errors.InvalidHttpResponse(path,
                    'Missing the Content-Range header in a'
                    ' multipart/byteranges part')
            ent_start, ent_end = HttpRangeResponse._parse_range(
                content_range, path)
            # The data of a part starts right after its headers
            self._add_range(ent_start, ent_end, match.end())

        self._finish_ranges()

    @staticmethod
    def _parse_boundary(ctype, path='<unknown>'):
        """Parse the Content-type field.

        This expects a multipart Content-type, and returns a
        regex which is capable of finding the boundaries
        in the multipart data.

        :param ctype: The value of the Content-Type header
        :param path: Provide to give better error messages.
        :return: A compiled regex; group 1 of each match is the value of
            the part's content-range header
        :raises errors.InvalidHttpContentType: If ctype is not
            multipart/byteranges, or carries no usable boundary
        """
        match = HttpMultipartRangeResponse._CONTENT_TYPE_RE.match(ctype)
        if not match:
            raise errors.InvalidHttpContentType(path, ctype,
                    "Expected multipart/byteranges with boundary")

        boundary = match.group(1)
        if len(boundary) >= 2 and boundary[0] == '"' and boundary[-1] == '"':
            # The parameter may be sent as a quoted string; the quotes are
            # not part of the delimiter found in the body.
            boundary = boundary[1:-1]
        if not boundary:
            # An empty boundary would make every '--' line a delimiter
            raise errors.InvalidHttpContentType(path, ctype,
                    "Expected multipart/byteranges with boundary")
        # mutter('multipart boundary is %s', boundary)
        pattern = HttpMultipartRangeResponse._BOUNDARY_PATT
        return re.compile(pattern % re.escape(boundary),
                          re.IGNORECASE | re.MULTILINE)
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
224
225
1786.1.25 by John Arbash Meinel
Test that we can extract headers properly.
226
def _is_multipart(content_type):
227
    return content_type.startswith('multipart/byteranges;')
228
229
1786.1.27 by John Arbash Meinel
Fix up the http transports so that tests pass with the new configuration.
230
def handle_response(url, code, headers, data):
    """Interpret the code & headers and return a HTTP response.

    This is a factory method which returns an appropriate HTTP response
    based on the code & headers it's given.

    :param url: The url being processed. Mostly for error reporting
    :param code: The integer HTTP response code
    :param headers: A dict-like object that contains the HTTP response headers
    :param data: A file-like object that can be read() to get the
                 requested data
    :return: A file-like object that can seek()+read() the
             ranges indicated by the headers.
    :raises errors.InvalidHttpContentType: For a 206 response without a
        Content-Type header
    :raises errors.InvalidHttpResponse: For a non-multipart 206 response
        without a Content-Range header, and for any code other than 200,
        206 and 404
    :raises errors.NoSuchFile: For a 404 response
    """
    if code == 206:
        try:
            content_type = headers['Content-Type']
        except KeyError:
            raise errors.InvalidHttpContentType(url, '',
                msg='Missing Content-Type')

        if _is_multipart(content_type):
            # Full fledged multipart response
            return HttpMultipartRangeResponse(url, content_type, data)

        # A response to a range request, but not multipart
        try:
            content_range = headers['Content-Range']
        except KeyError:
            raise errors.InvalidHttpResponse(url,
                'Missing the Content-Range header in a 206 range response')
        return HttpRangeResponse(url, content_range, data)

    if code == 200:
        # A regular non-range response, unfortunately the result from
        # urllib doesn't support seek, so we wrap it in a StringIO.
        # Both seek and tell are probed: the caller is promised an object
        # it can seek(), so one that only offers tell() is wrapped too.
        if (getattr(data, 'seek', None) is None
            or getattr(data, 'tell', None) is None):
            return StringIO(data.read())
        return data

    if code == 404:
        raise errors.NoSuchFile(url)

    # TODO: jam 20060713 Properly handle redirects (302 Found, etc)
    #       The '_get' code says to follow redirects, we probably
    #       should actually handle the return values
    raise errors.InvalidHttpResponse(url, "Unknown response code %s"
                                          % (code,))
1786.1.21 by John Arbash Meinel
(broken) Work on factoring out handle_response so we can test with fake headers.
279