internal representation is
(start, end, count, [1..count tuples (revid, newline)])

header = lines.pop(0)
start, end, c = [int(n) for n in header.split(',')]
# walk through the lines parsing.
start, end, count = [int(n) for n in header.split(',')]
origin, text = lines.pop(0).split(' ', 1)
origin, text = next().split(' ', 1)
contents.append((origin.decode('utf-8'), text))
yield start, end, c, contents
def parse_line_delta(self, lines, version):
return list(self.parse_line_delta_iter(lines))
result.append((start, end, count, contents))
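The hunk format these fragments imply is a 'start,end,count' header line followed by count lines of the form '<origin revid> <text>'. As a hedged illustration only (this is not the bzrlib implementation, and the revision ids and lines are made up), a self-contained sketch of that parse:

def parse_line_delta_sketch(lines):
    # Parse hunks into (start, end, count, [(origin, line), ...]) tuples,
    # mirroring the internal representation described above.
    result = []
    lines = iter(lines)
    for header in lines:
        start, end, count = [int(n) for n in header.split(',')]
        contents = [tuple(next(lines).split(' ', 1)) for _ in range(count)]
        result.append((start, end, count, contents))
    return result

# Hypothetical delta: replace lines 3..5 of the basis text with two new lines.
delta = ['3,5,2',
         'rev-a first replacement line\n',
         'rev-b second replacement line\n']
assert parse_line_delta_sketch(delta) == [
    (3, 5, 2, [('rev-a', 'first replacement line\n'),
               ('rev-b', 'second replacement line\n')])]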
def lower_fulltext(self, content):
"""convert a fulltext content record into a serializable form.
Yes, it's only 1.6 seconds, but they add up.

def _add_read_data(self, data):
# temp var for len(data) and switch to +='s.
len_data = len(data)
self.crc = zlib.crc32(data, self.crc)
self.extrabuf += data
self.extrasize += len_data
self.size += len_data
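_add_read_data keeps the running CRC by handing the previous value back to zlib.crc32, which is what lets data arrive chunk by chunk. A standalone check of that incremental behaviour (the sample chunks are made up):

import zlib

# Feeding chunks with the running value gives the same CRC as a single call
# over the concatenated data, which is what the running self.crc relies on.
crc = zlib.crc32(b'')
for chunk in (b'hello ', b'world'):
    crc = zlib.crc32(chunk, crc)
assert crc == zlib.crc32(b'hello world')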
def _read(self, size=1024):
# various optimisations:
# reduces lsprof count from 2500 to
# 8337 calls in 1272, 365 internal
if self.fileobj is None:
raise EOFError, "Reached EOF"
if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
# First, check if we're at the end of the file;
# if so, it's time to stop; no more members to read.
next_header_bytes = self.fileobj.read(10)
if next_header_bytes == '':
raise EOFError, "Reached EOF"
self._read_gzip_header(next_header_bytes)
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
self._new_member = False
# Read a chunk of data from the file
buf = self.fileobj.read(size)
# If the EOF has been reached, flush the decompression object
# and mark this object as finished.
self._add_read_data(self.decompress.flush())
assert len(self.decompress.unused_data) >= 8, "what does flush do?"
# tell the driving read() call we have stuffed all the data
raise EOFError, 'Reached EOF'
self._add_read_data(self.decompress.decompress(buf))
if self.decompress.unused_data != "":
# Ending case: we've come to the end of a member in the file,
# so seek back to the start of the data for the next member, which
# is the length of the decompress object's unused data minus the
# first 8 bytes for the end crc and size records.
# So seek back to the start of the unused data, finish up
# this member, and read a new gzip header.
# (The number of bytes to seek back is the length of the unused
# data, minus 8 because those 8 bytes are part of this member.)
seek_length = len(self.decompress.unused_data) - 8
assert seek_length > 0
self.fileobj.seek(-seek_length, 1)
# Check the CRC and file size, and set the flag so we read
# a new member on the next call
self._new_member = True
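As a hedged illustration of the member-boundary handling above (the 10-byte header read, the raw-deflate decompressobj, and the 8-byte trailer sitting at the front of unused_data), here is a standalone sketch. It is not the class above; it assumes members with no optional header fields, which is what GzipFile writes for an anonymous fileobj, and the payloads are made up:

import gzip
import io
import zlib

def iter_member_payloads(raw):
    # Yield the uncompressed payload of each gzip member in the byte string.
    pos = 0
    while pos < len(raw):
        # Assumes a bare 10-byte fixed header with no optional fields.
        assert raw[pos:pos + 2] == b'\x1f\x8b', 'not a gzip member'
        d = zlib.decompressobj(-zlib.MAX_WBITS)
        yield d.decompress(raw[pos + 10:]) + d.flush()
        # unused_data starts with this member's 8-byte CRC/ISIZE trailer;
        # everything after those 8 bytes is the next member's header.
        pos = len(raw) - len(d.unused_data) + 8

# Two concatenated members written into one buffer (hypothetical data).
buf = io.BytesIO()
for chunk in (b'first member\n', b'second member\n'):
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(chunk)

assert list(iter_member_payloads(buf.getvalue())) == [b'first member\n',
                                                      b'second member\n']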
def _read_eof(self):
"""Tuned to reduce function calls and eliminate file seeking:
reduces lsprof count from 800 to 288
avoid U32 call by using struct format L
# We've read to the end of the file, so we should have 8 bytes of
# unused data in the decompressor. If we don't, there is a corrupt file.
# We use these 8 bytes to calculate the CRC and the recorded file size.
# We then check that the computed CRC and size of the
# uncompressed data match the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32, isize = struct.unpack("<LL", self.decompress.unused_data[0:8])
# note that isize is unsigned - it can exceed 2GB
if crc32 != U32(self.crc):
raise IOError, "CRC check failed"
elif isize != LOWU32(self.size):
raise IOError, "Incorrect length of data produced"
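_read_eof depends on the member trailer being exactly eight bytes: a little-endian CRC32 followed by ISIZE, the uncompressed length mod 2**32. A small standalone check of that layout (the payload is made up), written for a current Python rather than the Python 2 code above:

import gzip
import io
import struct
import zlib

data = b'hello world' * 1000
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as f:
    f.write(data)
raw = buf.getvalue()

# Last 8 bytes of a member: CRC32 then ISIZE, both unsigned little-endian.
crc32, isize = struct.unpack('<LL', raw[-8:])
assert crc32 == zlib.crc32(data) & 0xffffffff
assert isize == len(data) % (2 ** 32)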
def _read_gzip_header(self, bytes=None):
"""Supply bytes if the minimum header size is already read.

:param bytes: 10 bytes of header data.
"""
# starting cost: 300 in 3998
# 15998 reads from 3998 calls
bytes = self.fileobj.read(10)
if magic != '\037\213':
raise IOError, 'Not a gzipped file'
method = ord(bytes[2:3])
raise IOError, 'Unknown compression method'
flag = ord(bytes[3:4])
# modtime = self.fileobj.read(4) (bytes[4:8])
# extraflag = self.fileobj.read(1) (bytes[8:9])
# os = self.fileobj.read(1) (bytes[9:10])
# self.fileobj.read(6)
# Read & discard the extra field, if present
xlen = ord(self.fileobj.read(1))
xlen = xlen + 256 * ord(self.fileobj.read(1))
self.fileobj.read(xlen)
# Read and discard a null-terminated string containing the filename
s = self.fileobj.read(1)
if not s or s == '\000':
# Read and discard a null-terminated string containing a comment
s = self.fileobj.read(1)
if not s or s == '\000':
self.fileobj.read(2) # Read & discard the 16-bit header CRC
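For reference, the fixed ten bytes that _read_gzip_header receives are magic (2), compression method (1), flags (1), modification time (4), extra flags (1) and OS (1); only a set flag bit makes the variable-length fields that follow appear. A standalone sketch of that layout, written for a current Python 3 rather than the Python 2 code above (the filename and payload are made up):

import gzip
import io
import struct

buf = io.BytesIO()
with gzip.GzipFile(filename='sample.txt', mode='wb', fileobj=buf) as f:
    f.write(b'payload')
header = buf.getvalue()[:10]

magic, method, flag = header[0:2], header[2], header[3]
mtime, = struct.unpack('<L', header[4:8])   # the modtime field (bytes[4:8] above)
assert magic == b'\x1f\x8b'
assert method == 8        # deflate
assert flag & 8           # FNAME set, so a null-terminated name follows byte 10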
def readline(self, size=-1):
"""Tuned to remove buffer length calls in _unread and...

also removes multiple len(c) calls, inlines _unread,
total savings - lsprof 5800 to 5300
8176 calls to read() in 1684
changing the min chunk size to 200 halved all the cache misses
leading to a drop to:
4168 calls to read() in 1646
- i.e. just reduced the function call overhead. May be worth
if size < 0: size = sys.maxint
readsize = min(200, size) # Read from the file in small chunks
return "".join(bufs) # Return resulting line
c = self.read(readsize)
# number of bytes read
if size is not None:
# We set i=size to break out of the loop under two
# conditions: 1) there's no newline, and the chunk is
# larger than size, or 2) there is a newline, but the
# resulting line would be longer than 'size'.
if i == -1 and len_c > size: i = size - 1
elif size <= i: i = size - 1
if i >= 0 or c == '':
# if i >= 0 we have a newline or have triggered the above
# "if size is not None" condition.
# if c == '' it's EOF.
bufs.append(c[:i+1]) # Add portion of last chunk
# -- inlined self._unread --
## self._unread(c[i+1:], len_c - i) # Push back rest of chunk
self.extrabuf = c[i+1:] + self.extrabuf
self.extrasize = len_c - i + self.extrasize
self.offset -= len_c - i
# -- end inlined self._unread --
return ''.join(bufs) # Return resulting line
# Append chunk to list, decrease 'size',
readsize = min(size, readsize * 2)
def readlines(self, sizehint=0):
# optimise to avoid all the buffer manipulation
# lsprof changed from:
# 4168 calls in 5472 with 32000 calls to readline()
# to:
# 4168 calls in 417.
# Negative numbers result in reading all the lines
content = self.read(sizehint)
return content.splitlines(True)
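The single read() plus splitlines(True) works because splitlines(True) keeps the line terminators, so the result matches what a readline() loop would have built, without the per-line call overhead. A tiny check with made-up content:

content = 'first line\nsecond line\nlast line without newline'
assert content.splitlines(True) == [
    'first line\n', 'second line\n', 'last line without newline']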
def _unread(self, buf, len_buf=None):
"""Tuned to remove unneeded len calls.

Because this is such an inner routine in readline, and readline is
in many inner loops, this has been inlined into readline().

The len_buf parameter combined with the reduction in len calls dropped
the lsprof ms count for this routine on my test data from 800 to 200 -
self.extrabuf = buf + self.extrabuf
self.extrasize = len_buf + self.extrasize
self.offset -= len_buf

def write(self, data):
if self.mode != gzip.WRITE: