internal representation is
(start, end, count, [1..count tuples (revid, newline)])

header = lines.pop(0)
start, end, c = [int(n) for n in header.split(',')]
# walk through the lines parsing.
start, end, count = [int(n) for n in header.split(',')]
origin, text = lines.pop(0).split(' ', 1)
origin, text = next().split(' ', 1)
contents.append((origin.decode('utf-8'), text))
yield start, end, c, contents
def parse_line_delta(self, lines, version):
return list(self.parse_line_delta_iter(lines))
result.append((start, end, count, contents))
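The hunk format these fragments imply is a 'start,end,count' header line followed by count lines of the form '<origin revid> <text>'. As a hedged illustration only (this is not the bzrlib implementation, and the revision ids and lines are made up), a self-contained sketch of that parse:

def parse_line_delta_sketch(lines):
    # Parse hunks into (start, end, count, [(origin, line), ...]) tuples,
    # mirroring the internal representation described above.
    result = []
    lines = iter(lines)
    for header in lines:
        start, end, count = [int(n) for n in header.split(',')]
        contents = [tuple(next(lines).split(' ', 1)) for _ in range(count)]
        result.append((start, end, count, contents))
    return result

# Hypothetical delta: replace lines 3..5 of the basis text with two new lines.
delta = ['3,5,2',
         'rev-a first replacement line\n',
         'rev-b second replacement line\n']
assert parse_line_delta_sketch(delta) == [
    (3, 5, 2, [('rev-a', 'first replacement line\n'),
               ('rev-b', 'second replacement line\n')])]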
def lower_fulltext(self, content):
"""convert a fulltext content record into a serializable form.
Yes, it's only 1.6 seconds, but they add up.

def _add_read_data(self, data):
# temp var for len(data) and switch to +='s.
len_data = len(data)
self.crc = zlib.crc32(data, self.crc)
self.extrabuf += data
self.extrasize += len_data
self.size += len_data
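_add_read_data keeps the running CRC by handing the previous value back to zlib.crc32, which is what lets data arrive chunk by chunk. A standalone check of that incremental behaviour (the sample chunks are made up):

import zlib

# Feeding chunks with the running value gives the same CRC as a single call
# over the concatenated data, which is what the running self.crc relies on.
crc = zlib.crc32(b'')
for chunk in (b'hello ', b'world'):
    crc = zlib.crc32(chunk, crc)
assert crc == zlib.crc32(b'hello world')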
def _read(self, size=1024):
# various optimisations:
# reduces lsprof count from 2500 to
# 8337 calls in 1272, 365 internal
if self.fileobj is None:
raise EOFError, "Reached EOF"
if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
# First, check if we're at the end of the file;
# if so, it's time to stop; no more members to read.
next_header_bytes = self.fileobj.read(10)
if next_header_bytes == '':
raise EOFError, "Reached EOF"
self._read_gzip_header(next_header_bytes)
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
self._new_member = False
# Read a chunk of data from the file
buf = self.fileobj.read(size)
# If the EOF has been reached, flush the decompression object
# and mark this object as finished.
self._add_read_data(self.decompress.flush())
assert len(self.decompress.unused_data) >= 8, "what does flush do?"
# tell the driving read() call we have stuffed all the data
raise EOFError, 'Reached EOF'
self._add_read_data(self.decompress.decompress(buf))
if self.decompress.unused_data != "":
# Ending case: we've come to the end of a member in the file,
# so seek back to the start of the data for the next member, which
# is the length of the decompress object's unused data minus the
# first 8 bytes for the end crc and size records.
# So seek back to the start of the unused data, finish up
# this member, and read a new gzip header.
# (The number of bytes to seek back is the length of the unused
# data, minus 8 because those 8 bytes are part of this member.)
seek_length = len(self.decompress.unused_data) - 8
assert seek_length > 0
self.fileobj.seek(-seek_length, 1)
# Check the CRC and file size, and set the flag so we read
# a new member on the next call
self._new_member = True
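As a hedged illustration of the member-boundary handling above (the 10-byte header read, the raw-deflate decompressobj, and the 8-byte trailer sitting at the front of unused_data), here is a standalone sketch. It is not the class above; it assumes members with no optional header fields, which is what GzipFile writes for an anonymous fileobj, and the payloads are made up:

import gzip
import io
import zlib

def iter_member_payloads(raw):
    # Yield the uncompressed payload of each gzip member in the byte string.
    pos = 0
    while pos < len(raw):
        # Assumes a bare 10-byte fixed header with no optional fields.
        assert raw[pos:pos + 2] == b'\x1f\x8b', 'not a gzip member'
        d = zlib.decompressobj(-zlib.MAX_WBITS)
        yield d.decompress(raw[pos + 10:]) + d.flush()
        # unused_data starts with this member's 8-byte CRC/ISIZE trailer;
        # everything after those 8 bytes is the next member's header.
        pos = len(raw) - len(d.unused_data) + 8

# Two concatenated members written into one buffer (hypothetical data).
buf = io.BytesIO()
for chunk in (b'first member\n', b'second member\n'):
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(chunk)

assert list(iter_member_payloads(buf.getvalue())) == [b'first member\n',
                                                      b'second member\n']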
def _read_eof(self):
"""Tuned to reduce function calls and eliminate file seeking:
reduces lsprof count from 800 to 288
avoid U32 call by using struct format L
# We've read to the end of the file, so we should have 8 bytes of
# unused data in the decompressor. If we don't, there is a corrupt file.
# We use these 8 bytes to calculate the CRC and the recorded file size.
# We then check that the computed CRC and size of the
# uncompressed data match the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32, isize = struct.unpack("<LL", self.decompress.unused_data[0:8])
# note that isize is unsigned - it can exceed 2GB
if crc32 != U32(self.crc):
raise IOError, "CRC check failed"
elif isize != LOWU32(self.size):
raise IOError, "Incorrect length of data produced"
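_read_eof depends on the member trailer being exactly eight bytes: a little-endian CRC32 followed by ISIZE, the uncompressed length mod 2**32. A small standalone check of that layout (the payload is made up), written for a current Python rather than the Python 2 code above:

import gzip
import io
import struct
import zlib

data = b'hello world' * 1000
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as f:
    f.write(data)
raw = buf.getvalue()

# Last 8 bytes of a member: CRC32 then ISIZE, both unsigned little-endian.
crc32, isize = struct.unpack('<LL', raw[-8:])
assert crc32 == zlib.crc32(data) & 0xffffffff
assert isize == len(data) % (2 ** 32)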
def _read_gzip_header(self, bytes=None):
"""Supply bytes if the minimum header size is already read.

:param bytes: 10 bytes of header data.
"""
# starting cost: 300 in 3998
# 15998 reads from 3998 calls
bytes = self.fileobj.read(10)
if magic != '\037\213':
raise IOError, 'Not a gzipped file'
method = ord(bytes[2:3])
raise IOError, 'Unknown compression method'
flag = ord(bytes[3:4])
# modtime = self.fileobj.read(4) (bytes[4:8])
# extraflag = self.fileobj.read(1) (bytes[8:9])
# os = self.fileobj.read(1) (bytes[9:10])
# self.fileobj.read(6)
# Read & discard the extra field, if present
xlen = ord(self.fileobj.read(1))
xlen = xlen + 256 * ord(self.fileobj.read(1))
self.fileobj.read(xlen)
# Read and discard a null-terminated string containing the filename
s = self.fileobj.read(1)
if not s or s == '\000':
# Read and discard a null-terminated string containing a comment
s = self.fileobj.read(1)
if not s or s == '\000':
self.fileobj.read(2) # Read & discard the 16-bit header CRC
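For reference, the fixed ten bytes that _read_gzip_header receives are magic (2), compression method (1), flags (1), modification time (4), extra flags (1) and OS (1); only a set flag bit makes the variable-length fields that follow appear. A standalone sketch of that layout, written for a current Python 3 rather than the Python 2 code above (the filename and payload are made up):

import gzip
import io
import struct

buf = io.BytesIO()
with gzip.GzipFile(filename='sample.txt', mode='wb', fileobj=buf) as f:
    f.write(b'payload')
header = buf.getvalue()[:10]

magic, method, flag = header[0:2], header[2], header[3]
mtime, = struct.unpack('<L', header[4:8])   # the modtime field (bytes[4:8] above)
assert magic == b'\x1f\x8b'
assert method == 8        # deflate
assert flag & 8           # FNAME set, so a null-terminated name follows byte 10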
def readline(self, size=-1):
"""Tuned to remove buffer length calls in _unread and...

also removes multiple len(c) calls, inlines _unread,
total savings - lsprof 5800 to 5300
8176 calls to read() in 1684
changing the min chunk size to 200 halved all the cache misses
leading to a drop to:
4168 calls to read() in 1646
- i.e. just reduced the function call overhead. May be worth
if size < 0: size = sys.maxint
readsize = min(200, size) # Read from the file in small chunks
return "".join(bufs) # Return resulting line
c = self.read(readsize)
# number of bytes read
if size is not None:
# We set i=size to break out of the loop under two
# conditions: 1) there's no newline, and the chunk is
# larger than size, or 2) there is a newline, but the
# resulting line would be longer than 'size'.
if i == -1 and len_c > size: i = size - 1
elif size <= i: i = size - 1
if i >= 0 or c == '':
# if i >= 0 we have a newline or have triggered the above
# "if size is not None" condition.
# if c == '' it's EOF.
bufs.append(c[:i+1]) # Add portion of last chunk
# -- inlined self._unread --
## self._unread(c[i+1:], len_c - i) # Push back rest of chunk
self.extrabuf = c[i+1:] + self.extrabuf
self.extrasize = len_c - i + self.extrasize
self.offset -= len_c - i
# -- end inlined self._unread --
return ''.join(bufs) # Return resulting line
# Append chunk to list, decrease 'size',
readsize = min(size, readsize * 2)
def readlines(self, sizehint=0):
# optimise to avoid all the buffer manipulation
# lsprof changed from:
# 4168 calls in 5472 with 32000 calls to readline()
# to:
# 4168 calls in 417.
# Negative numbers result in reading all the lines
content = self.read(sizehint)
return content.splitlines(True)
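The single read() plus splitlines(True) works because splitlines(True) keeps the line terminators, so the result matches what a readline() loop would have built, without the per-line call overhead. A tiny check with made-up content:

content = 'first line\nsecond line\nlast line without newline'
assert content.splitlines(True) == [
    'first line\n', 'second line\n', 'last line without newline']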
def _unread(self, buf, len_buf=None):
"""Tuned to remove unneeded len calls.

Because this is such an inner routine in readline, and readline is
in many inner loops, this has been inlined into readline().

The len_buf parameter combined with the reduction in len calls dropped
the lsprof ms count for this routine on my test data from 800 to 200 -
self.extrabuf = buf + self.extrabuf
self.extrasize = len_buf + self.extrasize
self.offset -= len_buf

def write(self, data):
if self.mode != gzip.WRITE: