/brz/remove-bazaar : revision 1664.2.9

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/tuned_gzip.py

Committer: Aaron Bentley
Date: 2006-04-18 23:42:34 UTC
mto: This revision was merged to the branch mainline in revision 1672.
Revision ID: aaron.bentley@utoronto.ca-20060418234234-d3d230b99ba70b9f

Ported weave merge test to versionedfile

files added:
.bzrignore

.rsyncexclude

BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

README

TODO

build-api

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/status.py

bzrlib/store

bzrlib/store/__init__.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_basis_inventory.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_textmerge.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textinv.py

bzrlib/textmerge.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

generate_docs.py

notes

setup.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files removed:
.bzrignore

COPYING

INSTALL

Makefile

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

commands.py

converter.py

dir.py

errors.py

fetch.py

foreign

foreign/.bzrignore

foreign/TODO

foreign/__init__.py

foreign/test_versionedfiles.py

foreign/upgrade.py

foreign/versionedfiles.py

mapping.py

notes

notes/roundtripping.txt

remote.py

repository.py

revspec.py

server.py

setup.py

shamap.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_dir.py

tests/test_fetch.py

tests/test_ids.py

tests/test_repository.py

versionedfiles.py

workingtree.py

Show diffs side-by-side

added added

removed removed

bzrlib/tuned_gzip.py

# Written by Robert Collins <robert.collins@canonical.com>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

# make GzipFile faster:

import gzip

from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC

import sys

import struct

import zlib

__all__ = ["GzipFile"]

class GzipFile(gzip.GzipFile):

"""Knit tuned version of GzipFile.

This is based on the following lsprof stats:

python 2.4 stock GzipFile write:

58971 0 5644.3090 2721.4730 gzip:193(write)

+58971 0 1159.5530 1159.5530 +<built-in method compress>

+176913 0 987.0320 987.0320 +<len>

+58971 0 423.1450 423.1450 +<zlib.crc32>

+58971 0 353.1060 353.1060 +<method 'write' of 'cStringIO.

StringO' objects>

tuned GzipFile write:

58971 0 4477.2590 2103.1120 bzrlib.knit:1250(write)

+58971 0 1297.7620 1297.7620 +<built-in method compress>

+58971 0 406.2160 406.2160 +<zlib.crc32>

+58971 0 341.9020 341.9020 +<method 'write' of 'cStringIO.

StringO' objects>

+58971 0 328.2670 328.2670 +<len>

Yes, its only 1.6 seconds, but they add up.

"""

def _add_read_data(self, data):

# 4169 calls in 183

# temp var for len(data) and switch to +='s.

# 4169 in 139

len_data = len(data)

self.crc = zlib.crc32(data, self.crc)

self.extrabuf += data

self.extrasize += len_data

self.size += len_data

def _read(self, size=1024):

# various optimisations:

# reduces lsprof count from 2500 to

# 8337 calls in 1272, 365 internal

if self.fileobj is None:

raise EOFError, "Reached EOF"

if self._new_member:

# If the _new_member flag is set, we have to

# jump to the next member, if there is one.

# First, check if we're at the end of the file;

# if so, it's time to stop; no more members to read.

next_header_bytes = self.fileobj.read(10)

if next_header_bytes == '':

raise EOFError, "Reached EOF"

self._init_read()

self._read_gzip_header(next_header_bytes)

self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

self._new_member = False

# Read a chunk of data from the file

buf = self.fileobj.read(size)

# If the EOF has been reached, flush the decompression object

# and mark this object as finished.

if buf == "":

self._add_read_data(self.decompress.flush())

assert len(self.decompress.unused_data) >= 8, "what does flush do?"

self._read_eof()

# tell the driving read() call we have stuffed all the data

# in self.extrabuf

raise EOFError, 'Reached EOF'

self._add_read_data(self.decompress.decompress(buf))

100

101

if self.decompress.unused_data != "":

102

# Ending case: we've come to the end of a member in the file,

103

# so seek back to the start of the data for the next member which

104

# is the length of the decompress objects unused data - the first

105

# 8 bytes for the end crc and size records.

106

107

# so seek back to the start of the unused data, finish up

108

# this member, and read a new gzip header.

109

# (The number of bytes to seek back is the length of the unused

110

# data, minus 8 because those 8 bytes are part of this member.

111

seek_length = len (self.decompress.unused_data) - 8

112

if seek_length > 0:

113

# we read too much data

114

self.fileobj.seek(-seek_length, 1)

115

elif seek_length < 0:

116

# we haven't read enough to check the checksum.

117

assert -8 < seek_length, "too great a seek."

118

buf = self.fileobj.read(-seek_length)

119

self.decompress.decompress(buf)

120

121

# Check the CRC and file size, and set the flag so we read

122

# a new member on the next call

123

self._read_eof()

124

self._new_member = True

125

126

def _read_eof(self):

127

"""tuned to reduce function calls and eliminate file seeking:

128

pass 1:

129

reduces lsprof count from 800 to 288

130

4168 in 296

131

avoid U32 call by using struct format L

132

4168 in 200

133

"""

134

# We've read to the end of the file, so we should have 8 bytes of

135

# unused data in the decompressor. If we dont, there is a corrupt file.

136

# We use these 8 bytes to calculate the CRC and the recorded file size.

137

# We then check the that the computed CRC and size of the

138

# uncompressed data matches the stored values. Note that the size

139

# stored is the true file size mod 2**32.

140

crc32, isize = struct.unpack("<LL", self.decompress.unused_data[0:8])

141

# note that isize is unsigned - it can exceed 2GB

142

if crc32 != U32(self.crc):

143

raise IOError, "CRC check failed %d %d" % (crc32, U32(self.crc))

144

elif isize != LOWU32(self.size):

145

raise IOError, "Incorrect length of data produced"

146

147

def _read_gzip_header(self, bytes=None):

148

"""Supply bytes if the minimum header size is already read.

149

150

:param bytes: 10 bytes of header data.

151

"""

152

"""starting cost: 300 in 3998

153

15998 reads from 3998 calls

154

final cost 168

155

"""

156

if bytes is None:

157

bytes = self.fileobj.read(10)

158

magic = bytes[0:2]

159

if magic != '\037\213':

160

raise IOError, 'Not a gzipped file'

161

method = ord(bytes[2:3])

162

if method != 8:

163

raise IOError, 'Unknown compression method'

164

flag = ord(bytes[3:4])

165

# modtime = self.fileobj.read(4) (bytes [4:8])

166

# extraflag = self.fileobj.read(1) (bytes[8:9])

167

# os = self.fileobj.read(1) (bytes[9:10])

168

# self.fileobj.read(6)

169

170

if flag & FEXTRA:

171

# Read & discard the extra field, if present

172

xlen = ord(self.fileobj.read(1))

173

xlen = xlen + 256*ord(self.fileobj.read(1))

174

self.fileobj.read(xlen)

175

if flag & FNAME:

176

# Read and discard a null-terminated string containing the filename

177

while True:

178

s = self.fileobj.read(1)

179

if not s or s=='\000':

180

break

181

if flag & FCOMMENT:

182

# Read and discard a null-terminated string containing a comment

183

while True:

184

s = self.fileobj.read(1)

185

if not s or s=='\000':

186

break

187

if flag & FHCRC:

188

self.fileobj.read(2) # Read & discard the 16-bit header CRC

189

190

def readline(self, size=-1):

191

"""Tuned to remove buffer length calls in _unread and...

192

193

also removes multiple len(c) calls, inlines _unread,

194

total savings - lsprof 5800 to 5300

195

phase 2:

196

4168 calls in 2233

197

8176 calls to read() in 1684

198

changing the min chunk size to 200 halved all the cache misses

199

leading to a drop to:

200

4168 calls in 1977

201

4168 call to read() in 1646

202

- i.e. just reduced the function call overhead. May be worth

203

keeping.

204

"""

205

if size < 0: size = sys.maxint

206

bufs = []

207

readsize = min(200, size) # Read from the file in small chunks

208

while True:

209

if size == 0:

210

return "".join(bufs) # Return resulting line

211

212

# c is the chunk

213

c = self.read(readsize)

214

# number of bytes read

215

len_c = len(c)

216

i = c.find('\n')

217

if size is not None:

218

# We set i=size to break out of the loop under two

219

# conditions: 1) there's no newline, and the chunk is

220

# larger than size, or 2) there is a newline, but the

221

# resulting line would be longer than 'size'.

222

if i==-1 and len_c > size: i=size-1

223

elif size <= i: i = size -1

224

225

if i >= 0 or c == '':

226

# if i>= 0 we have a newline or have triggered the above

227

# if size is not None condition.

228

# if c == '' its EOF.

229

bufs.append(c[:i+1]) # Add portion of last chunk

230

# -- inlined self._unread --

231

## self._unread(c[i+1:], len_c - i) # Push back rest of chunk

232

self.extrabuf = c[i+1:] + self.extrabuf

233

self.extrasize = len_c - i + self.extrasize

234

self.offset -= len_c - i

235

# -- end inlined self._unread --

236

return ''.join(bufs) # Return resulting line

237

238

# Append chunk to list, decrease 'size',

239

bufs.append(c)

240

size = size - len_c

241

readsize = min(size, readsize * 2)

242

243

def readlines(self, sizehint=0):

244

# optimise to avoid all the buffer manipulation

245

# lsprof changed from:

246

# 4168 calls in 5472 with 32000 calls to readline()

247

# to :

248

# 4168 calls in 417.

249

# Negative numbers result in reading all the lines

250

if sizehint <= 0:

251

sizehint = -1

252

content = self.read(sizehint)

253

return content.splitlines(True)

254

255

def _unread(self, buf, len_buf=None):

256

"""tuned to remove unneeded len calls.

257

258

because this is such an inner routine in readline, and readline is

259

in many inner loops, this has been inlined into readline().

260

261

The len_buf parameter combined with the reduction in len calls dropped

262

the lsprof ms count for this routine on my test data from 800 to 200 -

263

a 75% saving.

264

"""

265

if len_buf is None:

266

len_buf = len(buf)

267

self.extrabuf = buf + self.extrabuf

268

self.extrasize = len_buf + self.extrasize

269

self.offset -= len_buf

270

271

def write(self, data):

272

if self.mode != gzip.WRITE:

273

import errno

274

raise IOError(errno.EBADF, "write() on read-only GzipFile object")

275

276

if self.fileobj is None:

277

raise ValueError, "write() on closed GzipFile object"

278

data_len = len(data)

279

if data_len > 0:

280

self.size = self.size + data_len

281

self.crc = zlib.crc32(data, self.crc)

282

self.fileobj.write( self.compress.compress(data) )

283

self.offset += data_len

284

285

def writelines(self, lines):

286

# profiling indicated a significant overhead

287

# calling write for each line.

288

# this batch call is a lot faster :).

289

# (4 seconds to 1 seconds for the sample upgrades I was testing).

290

self.write(''.join(lines))

291

292

Older »