# Copyright (C) 2007 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Container format for Bazaar data.

"Containers" and "records" are described in doc/developers/container-format.txt.
"""

from cStringIO import StringIO
import re

from bzrlib import errors


# First line of every version-1 container.  The serialiser emits it followed
# by a newline, and the readers reject any stream whose first line differs.
FORMAT_ONE = "Bazaar pack format 1 (introduced in 0.18)"


# Matches one whitespace byte (tab, LF, VT, FF, CR or space).  _check_name
# uses it to reject record names that contain any of these.
_whitespace_re = re.compile('[\t\n\x0b\x0c\r ]')


def _check_name(name):
    """Do some basic checking of 'name'.

    At the moment this only checks that the name contains no whitespace
    characters.

    :param name: a single name part (one element of a name tuple).
    :raises InvalidRecordError: if name is not valid.
    :seealso: _check_name_encoding
    """
    if _whitespace_re.search(name) is not None:
        raise errors.InvalidRecordError("%r is not a valid name." % (name,))


def _check_name_encoding(name):
48
"""Check that 'name' is valid UTF-8.
50
This is separate from _check_name because UTF-8 decoding is relatively
51
expensive, and we usually want to avoid it.
53
:raises InvalidRecordError: if name is not valid UTF-8.
57
except UnicodeDecodeError, e:
58
raise errors.InvalidRecordError(str(e))
61
class ContainerSerialiser(object):
    """A helper class for serialising containers.

    It simply returns bytes from method calls to 'begin', 'end' and
    'bytes_record'; it never writes anything itself.  You may find
    ContainerWriter to be a more convenient interface.
    """

    def begin(self):
        """Return the bytes to begin a container."""
        return FORMAT_ONE + "\n"

    def end(self):
        """Return the bytes to finish a container."""
        # NOTE(review): this return was missing from the damaged listing.
        # "E" is the end-marker byte that the readers in this module accept;
        # confirm against version control.
        return "E"

    def bytes_record(self, bytes, names):
        """Return the bytes for a Bytes record with the given names and
        contents.

        :param bytes: the record body, a bytestring.
        :param names: an iterable of name tuples.  The parts of each tuple
            are joined with NUL bytes and written as one header line.
        :raises InvalidRecordError: if any name part fails _check_name.
        :return: the complete serialised record as one bytestring.
        """
        # Kind marker.
        # NOTE(review): the initial list was missing from the damaged listing;
        # "B" is the record-kind byte the readers in this module dispatch on.
        byte_sections = ["B"]
        # Body length, as decimal text on its own line.
        byte_sections.append(str(len(bytes)) + "\n")
        # One header line per name tuple.
        for name_tuple in names:
            # Make sure we're writing valid names.  A bad name raises before
            # anything is returned, so no partial record escapes from here.
            for name in name_tuple:
                # NOTE(review): this call was missing from the damaged
                # listing; reconstructed from the comment above.
                _check_name(name)
            byte_sections.append('\x00'.join(name_tuple) + "\n")
        # An empty line ends the headers.
        byte_sections.append("\n")
        # Finally, the contents.
        byte_sections.append(bytes)
        # XXX: This causes a memory copy of bytes in size, but is usually
        # faster than two write calls (12 vs 13 seconds to output a gig of
        # 1k records.) - results may differ on significantly larger records
        # like .iso's but as they should be rare in any case and thus not
        # likely to be the common case. The biggest issue is causing extreme
        # memory pressure in that case. One possible improvement here is to
        # check the size of the content before deciding whether to join here
        # or to issue two separate writes.
        return ''.join(byte_sections)


class ContainerWriter(object):
    """A class for writing containers to a file.

    :attribute records_written: The number of user records added to the
        container. This does not count the prelude or suffix of the container
        introduced by the begin() and end() methods.
    :attribute current_offset: The total number of bytes passed to the
        write callable so far.
    """

    def __init__(self, write_func):
        """Constructor.

        :param write_func: a callable that will be called when this
            ContainerWriter needs to write some bytes.
        """
        self._write_func = write_func
        self.current_offset = 0
        self.records_written = 0
        self._serialiser = ContainerSerialiser()

    def begin(self):
        """Begin writing a container."""
        self.write_func(self._serialiser.begin())

    def write_func(self, bytes):
        """Write 'bytes' via the supplied callable and advance the offset."""
        self._write_func(bytes)
        self.current_offset += len(bytes)

    def end(self):
        """Finish writing a container."""
        self.write_func(self._serialiser.end())

    def add_bytes_record(self, bytes, names):
        """Add a Bytes record with the given names.

        :param bytes: The bytes to insert.
        :param names: The names to give the inserted bytes. Each name is
            a tuple of bytestrings. The bytestrings may not contain
            whitespace.
        :return: An offset, length tuple. The offset is the offset
            of the record within the container, and the length is the
            length of data that will need to be read to reconstitute the
            record. These offset and length can only be used with the pack
            interface - they might be offset by headers or other such details
            and thus are only suitable for use by a ContainerReader.
        """
        # Remember where the record starts before anything is written.
        current_offset = self.current_offset
        serialised_record = self._serialiser.bytes_record(bytes, names)
        self.write_func(serialised_record)
        self.records_written += 1
        # return a memo of where we wrote data to allow random access.
        return current_offset, self.current_offset - current_offset


class ReadVFile(object):
    """Adapt a readv result iterator to a file like protocol."""

    def __init__(self, readv_result):
        """Constructor.

        :param readv_result: an object whose ``next()`` method returns
            2-tuples; the second item of each is one hunk of data.
        """
        self.readv_result = readv_result
        # the most recent readv result block
        # NOTE(review): this initialisation was missing from the damaged
        # listing; the 'is None' test in _next requires it.
        self._string = None

    def _next(self):
        """Move to the next hunk if there is none yet or it is used up.

        NOTE(review): this method's header was missing from the damaged
        listing; the name '_next' is a reconstruction - confirm against
        version control.
        """
        if (self._string is None or
            self._string.tell() == self._string_length):
            # Only the data is used; the first item of the pair is ignored.
            length, data = self.readv_result.next()
            self._string_length = len(data)
            self._string = StringIO(data)

    def read(self, length):
        """Return exactly 'length' bytes from the current hunk.

        :raises BzrError: if the current hunk holds fewer bytes than asked.
        """
        self._next()
        result = self._string.read(length)
        if len(result) < length:
            raise errors.BzrError('request for too much data from a readv hunk.')
        return result

    def readline(self):
        """Note that readline will not cross readv segments.

        :raises BzrError: if the hunk ends before a newline is found.
        """
        self._next()
        result = self._string.readline()
        if self._string.tell() == self._string_length and result[-1] != '\n':
            raise errors.BzrError('short readline in the readvfile hunk.')
        return result


def make_readv_reader(transport, filename, requested_records):
    """Create a ContainerReader that will read selected records only.

    :param transport: The transport the pack file is located on.
    :param filename: The filename of the pack file.
    :param requested_records: The record offset, length tuples as returned
        by add_bytes_record for the desired records.
    :return: a ContainerReader fed by a ReadVFile over the readv results.
    """
    # Always fetch the format line first (its text plus the newline), so the
    # reader sees a container header ahead of the requested records.
    readv_blocks = [(0, len(FORMAT_ONE)+1)]
    readv_blocks.extend(requested_records)
    result = ContainerReader(ReadVFile(
        transport.readv(filename, readv_blocks)))
    # NOTE(review): this return was missing from the damaged listing.
    return result


class BaseReader(object):
    """Shared plumbing for readers that pull from a file-like source."""

    def __init__(self, source_file):
        """Constructor.

        :param source_file: a file-like object with `read` and `readline`
            methods.
        """
        self._source = source_file

    def reader_func(self, length=None):
        """Return the result of the source's ``read(length)``."""
        return self._source.read(length)

    def _read_line(self):
        """Read one line from the source and return it without its newline.

        :raises UnexpectedEndOfContainerError: if the source ends before a
            newline is seen.
        """
        line = self._source.readline()
        if not line.endswith('\n'):
            raise errors.UnexpectedEndOfContainerError()
        return line.rstrip('\n')


class ContainerReader(BaseReader):
    """A class for reading Bazaar's container format."""

    def iter_records(self):
        """Iterate over the container, yielding each record as it is read.

        Each yielded record will be a 2-tuple of (names, callable), where
        names is a ``list`` of name tuples and the callable is a function
        that takes one argument, the maximum number of bytes to return.

        You **must not** call the callable after advancing the iterator to the
        next record.  That is, this code is invalid::

            record_iter = container.iter_records()
            names1, callable1 = record_iter.next()
            names2, callable2 = record_iter.next()
            bytes1 = callable1(None)

        As it will give incorrect results and invalidate the state of the
        ContainerReader.

        :raises ContainerError: if any sort of container corruption is
            detected, e.g. UnknownContainerFormatError if the format of the
            container is unrecognised.
        :seealso: BytesRecordReader.read
        """
        # NOTE(review): this call was missing from the damaged listing.  It
        # is reconstructed because the format line must be consumed before
        # the first record-kind byte can be read.
        self._read_format()
        return self._iter_records()

    def iter_record_objects(self):
        """Iterate over the container, yielding each record as it is read.

        Each yielded record will be an object with ``read`` and ``validate``
        methods.  Like with iter_records, it is not safe to use a record object
        after advancing the iterator to yield next record.

        :raises ContainerError: if any sort of container corruption is
            detected, e.g. UnknownContainerFormatError if the format of the
            container is unrecognised.
        :seealso: iter_records
        """
        # NOTE(review): reconstructed call, as in iter_records.
        self._read_format()
        return self._iter_record_objects()

    def _iter_records(self):
        """Yield the ``read()`` result of every record object."""
        for record in self._iter_record_objects():
            # NOTE(review): the loop body was missing from the damaged
            # listing; reconstructed from the (names, callable) contract.
            yield record.read()

    def _iter_record_objects(self):
        """Yield one record reader per record until the end marker.

        NOTE(review): the loop header, the yield, the return and the final
        else were missing from the damaged listing and are reconstructed.
        """
        while True:
            record_kind = self.reader_func(1)
            if record_kind == 'B':
                # Bytes record.
                reader = BytesRecordReader(self._source)
                yield reader
            elif record_kind == 'E':
                # End marker.  There are no more records.
                return
            elif record_kind == '':
                # End of stream encountered, but no End Marker record seen, so
                # this container is incomplete.
                raise errors.UnexpectedEndOfContainerError()
            else:
                # Unknown record type.
                raise errors.UnknownRecordTypeError(record_kind)

    def _read_format(self):
        """Consume the first line and check it names a known format.

        :raises UnknownContainerFormatError: if the line is not FORMAT_ONE.
        """
        format = self._read_line()
        if format != FORMAT_ONE:
            raise errors.UnknownContainerFormatError(format)

    def validate(self):
        """Validate this container and its records.

        Validating consumes the data stream just like iter_records and
        iter_record_objects, so you cannot call it after
        iter_records/iter_record_objects.

        :raises ContainerError: if something is invalid.
        """
        # NOTE(review): the method header, the set initialisation and the
        # read_bytes(None) call were missing from the damaged listing.
        all_names = set()
        for record_names, read_bytes in self.iter_records():
            # Drain the body so the stream is positioned at the next record.
            read_bytes(None)
            for name_tuple in record_names:
                for name in name_tuple:
                    _check_name_encoding(name)
                # Check that the name is unique.  Note that Python will refuse
                # to decode non-shortest forms of UTF-8 encoding, so there is
                # no risk that the same unicode string has been encoded in two
                # different ways.
                if name_tuple in all_names:
                    raise errors.DuplicateRecordNameError(name_tuple)
                all_names.add(name_tuple)
        # Nothing may follow the end marker.
        excess_bytes = self.reader_func(1)
        if excess_bytes != '':
            raise errors.ContainerHasExcessDataError(excess_bytes)


class BytesRecordReader(BaseReader):
    """Reader for a single Bytes record.

    ContainerReader constructs one after it has consumed the record's kind
    byte, so reading starts at the length line.
    """

    def read(self):
        """Read this record.

        You can either validate or read a record, you can't do both.

        :returns: A tuple of (names, callable).  The callable can be called
            repeatedly to obtain the bytes for the record, with a max_length
            argument.  If max_length is None, returns all the bytes.  Because
            records can be arbitrarily large, using None is not recommended
            unless you have reason to believe the content will fit in memory.
        :raises InvalidRecordError: if the length line is not an integer or
            a name fails _check_name.
        """
        # NOTE(review): the method header, try/except, loop header, loop exit
        # and _check_name call were missing from the damaged listing and are
        # reconstructed - confirm against version control.
        # Read the content length.
        length_line = self._read_line()
        try:
            length = int(length_line)
        except ValueError:
            raise errors.InvalidRecordError(
                "%r is not a valid length." % (length_line,))

        # Read the list of names.
        names = []
        while True:
            name_line = self._read_line()
            if name_line == '':
                # An empty line ends the headers.
                break
            name_tuple = tuple(name_line.split('\x00'))
            for name in name_tuple:
                _check_name(name)
            names.append(name_tuple)

        self._remaining_length = length
        return names, self._content_reader

    def _content_reader(self, max_length):
        """Return up to max_length bytes of the body (all of it if None).

        :raises UnexpectedEndOfContainerError: if the source runs out before
            the requested bytes are read.
        """
        if max_length is None:
            length_to_read = self._remaining_length
        else:
            length_to_read = min(max_length, self._remaining_length)
        self._remaining_length -= length_to_read
        bytes = self.reader_func(length_to_read)
        if len(bytes) != length_to_read:
            raise errors.UnexpectedEndOfContainerError()
        # NOTE(review): this return was missing from the damaged listing.
        return bytes

    def validate(self):
        """Validate this record.

        You can either validate or read, you can't do both.

        :raises ContainerError: if this record is invalid.
        """
        names, read_bytes = self.read()
        for name_tuple in names:
            for name in name_tuple:
                _check_name_encoding(name)
        # NOTE(review): this call was missing from the damaged listing.  It
        # drains the body so a truncated record is detected.
        read_bytes(None)


class ContainerPushParser(object):
    """Incremental parser: bytes are pushed in, parsed records are pulled out.

    Feed data with accept_bytes(); collect completed (names, body) tuples
    with read_pending_records(); ask bytes_to_read() what to fetch next.

    NOTE(review): several statements of this class (the constructor header
    and buffer initialisation, a number of returns, 'is not None' guards,
    try/except and else lines) were missing from the damaged listing and are
    reconstructed from the surrounding code - confirm against version control.
    """

    def __init__(self):
        # Unparsed input; accept_bytes appends to it.
        self._buffer = ''
        self._state_handler = self._state_expecting_format_line
        self._parsed_records = []
        self._reset_current_record()

    def _reset_current_record(self):
        """Forget the header fields of the record being parsed."""
        self._current_record_length = None
        self._current_record_names = []

    def accept_bytes(self, bytes):
        """Append 'bytes' to the buffer and parse as much as possible."""
        self._buffer += bytes
        # Keep iterating the state machine until it stops consuming bytes from
        # the buffer.
        last_buffer_length = None
        cur_buffer_length = len(self._buffer)
        while cur_buffer_length != last_buffer_length:
            last_buffer_length = cur_buffer_length
            self._state_handler()
            cur_buffer_length = len(self._buffer)

    def read_pending_records(self):
        """Return the records parsed so far and clear the pending list."""
        records = self._parsed_records
        self._parsed_records = []
        return records

    def _consume_line(self):
        """Take a line out of the buffer, and return the line.

        If a newline byte is not found in the buffer, the buffer is
        unchanged and this returns None instead.
        """
        newline_pos = self._buffer.find('\n')
        if newline_pos != -1:
            line = self._buffer[:newline_pos]
            self._buffer = self._buffer[newline_pos+1:]
            return line
        else:
            return None

    def _state_expecting_format_line(self):
        line = self._consume_line()
        if line is not None:
            if line != FORMAT_ONE:
                raise errors.UnknownContainerFormatError(line)
            self._state_handler = self._state_expecting_record_type

    def _state_expecting_record_type(self):
        if len(self._buffer) >= 1:
            record_type = self._buffer[0]
            self._buffer = self._buffer[1:]
            if record_type == 'B':
                self._state_handler = self._state_expecting_length
            elif record_type == 'E':
                self._state_handler = self._state_expecting_nothing
            else:
                raise errors.UnknownRecordTypeError(record_type)

    def _state_expecting_length(self):
        line = self._consume_line()
        if line is not None:
            try:
                self._current_record_length = int(line)
            except ValueError:
                raise errors.InvalidRecordError(
                    "%r is not a valid length." % (line,))
            self._state_handler = self._state_expecting_name

    def _state_expecting_name(self):
        encoded_name_parts = self._consume_line()
        if encoded_name_parts == '':
            # An empty line ends the headers; the body follows.
            self._state_handler = self._state_expecting_body
        elif encoded_name_parts:
            name_parts = tuple(encoded_name_parts.split('\x00'))
            for name_part in name_parts:
                _check_name(name_part)
            self._current_record_names.append(name_parts)
        # Otherwise (None): no complete line buffered yet; wait for more.

    def _state_expecting_body(self):
        # Only act once the whole body is buffered.
        if len(self._buffer) >= self._current_record_length:
            body_bytes = self._buffer[:self._current_record_length]
            self._buffer = self._buffer[self._current_record_length:]
            record = (self._current_record_names, body_bytes)
            self._parsed_records.append(record)
            self._reset_current_record()
            self._state_handler = self._state_expecting_record_type

    def _state_expecting_nothing(self):
        # The end marker has been seen; any further input is left untouched.
        pass

    def bytes_to_read(self):
        """Describe what the caller should read next.

        :return: a 2-tuple (what, how_much).  'what' is 'line', 'bytes' or
            'end'; 'how_much' is a byte count for 'bytes'.
        :raises AssertionError: if the parser is in an unknown state.
        """
        # NOTE(review): every return below except ('bytes', remaining) was
        # missing from the damaged listing.  The first items match what
        # iter_records_from_file tests for; the second items of 'line' and
        # 'end', the count of 1 and the clamp are reconstructions.
        newline_terminated_states = [
            self._state_expecting_name,
            self._state_expecting_length,
            self._state_expecting_format_line,
            ]
        if self._state_handler in newline_terminated_states:
            return 'line', None
        elif self._state_handler == self._state_expecting_record_type:
            return 'bytes', 1
        elif self._state_handler == self._state_expecting_body:
            remaining = self._current_record_length - len(self._buffer)
            if remaining < 0:
                # Defensive: never ask for a negative number of bytes.
                remaining = 0
            return 'bytes', remaining
        elif self._state_handler == self._state_expecting_nothing:
            return 'end', None
        else:
            # Call form of raise: same exception and message, and valid
            # syntax on every Python version.
            raise AssertionError(
                'Unknown ContainerPushParser state %r' % self._state_handler)


def iter_records_from_file(source_file):
499
parser = ContainerPushParser()
501
read_what, read_how_much = parser.bytes_to_read()
502
if read_what == 'line':
503
bytes = source_file.readline()
504
elif read_what == 'bytes':
505
read_how_much = max(read_how_much, 16384)
506
bytes = source_file.read(read_how_much)
507
elif read_what == 'end':
510
raise AssertionError, "Bad bytes_to_read: %r" % (bytes_to_read,)
511
parser.accept_bytes(bytes)
512
for record in parser.read_pending_records():