/brz/remove-bazaar : revision 1662.1.5

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/tuned_gzip.py

Committer: Martin Pool
Date: 2006-04-18 06:49:41 UTC
mto: This revision was merged to the branch mainline in revision 1670.
Revision ID: mbp@sourcefrog.net-20060418064941-d6a8c9a334e9e7e1

(test_escaped_store) Avoid calling deprecated WeaveStore.get_lines method

files added:
.bzrignore

.rsyncexclude

BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

README

TODO

build-api

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/status.py

bzrlib/store

bzrlib/store/__init__.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_basis_inventory.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textinv.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

generate_docs.py

notes

setup.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files removed:
.bzrignore

COPYING

INSTALL

Makefile

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

commands.py

converter.py

dir.py

errors.py

fetch.py

foreign

foreign/.bzrignore

foreign/TODO

foreign/__init__.py

foreign/test_versionedfiles.py

foreign/upgrade.py

foreign/versionedfiles.py

mapping.py

notes

notes/roundtripping.txt

remote.py

repository.py

revspec.py

server.py

setup.py

shamap.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_dir.py

tests/test_fetch.py

tests/test_ids.py

tests/test_repository.py

versionedfiles.py

workingtree.py

Show diffs side-by-side

added added

removed removed

bzrlib/tuned_gzip.py

# Written by Robert Collins <robert.collins@canonical.com>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

# make GzipFile faster:

import gzip

from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC

import sys

import struct

import zlib

__all__ = ["GzipFile"]

class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    Overrides the hot paths of the stock ``gzip.GzipFile`` (header parsing,
    reading, line splitting and writing) with versions that make fewer
    function calls and avoid seeking where possible.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.StringO' objects>
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.StringO' objects>
    +58971     0    328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def _add_read_data(self, data):
        """Account for freshly decompressed ``data``.

        Updates the running CRC, appends the data to the read-ahead buffer
        and bumps both the buffered size and the total uncompressed size.
        """
        # 4169 calls in 183
        # temp var for len(data) and switch to +='s.
        # 4169 in 139
        len_data = len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf += data
        self.extrasize += len_data
        self.size += len_data

    def _read(self, size=1024):
        """Read and decompress one chunk of up to ``size`` compressed bytes.

        The decompressed data is stored via ``_add_read_data``.

        :raises EOFError: when the file object is gone, when there is no
            further gzip member, or after the final member has been flushed
            and verified.
        """
        # various optimisations:
        # reduces lsprof count from 2500 to
        # 8337 calls in 1272, 365 internal
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            next_header_bytes = self.fileobj.read(10)
            if next_header_bytes == '':
                raise EOFError("Reached EOF")

            self._init_read()
            self._read_gzip_header(next_header_bytes)
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.
        if buf == "":
            self._add_read_data(self.decompress.flush())
            assert len(self.decompress.unused_data) >= 8, "what does flush do?"
            self._read_eof()
            # tell the driving read() call we have stuffed all the data
            # in self.extrabuf
            raise EOFError('Reached EOF')

        self._add_read_data(self.decompress.decompress(buf))

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the data for the next member which
            # is the length of the decompress objects unused data - the first
            # 8 bytes for the end crc and size records.
            #
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because those 8 bytes are part of this member.
            seek_length = len(self.decompress.unused_data) - 8
            if seek_length:
                assert seek_length > 0
                self.fileobj.seek(-seek_length, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _read_eof(self):
        """Verify the CRC and size recorded in the member trailer.

        tuned to reduce function calls and eliminate file seeking:
        pass 1:
        reduces lsprof count from 800 to 288
        4168 in 296
        avoid U32 call by using struct format L
        4168 in 200

        :raises IOError: if the stored CRC or length does not match the
            values computed while decompressing.
        """
        # We've read to the end of the file, so we should have 8 bytes of
        # unused data in the decompressor. If we don't, there is a corrupt
        # file.
        # We use these 8 bytes to calculate the CRC and the recorded file size.
        # We then check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<LL", self.decompress.unused_data[0:8])
        # note that isize is unsigned - it can exceed 2GB
        if crc32 != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def _read_gzip_header(self, bytes=None):
        """Supply bytes if the minimum header size is already read.

        :param bytes: 10 bytes of header data.
        :raises IOError: if the magic number or compression method is wrong.
        """
        # starting cost: 300 in 3998
        # 15998 reads from 3998 calls
        # final cost 168
        if bytes is None:
            bytes = self.fileobj.read(10)
        magic = bytes[0:2]
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(bytes[2:3])
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(bytes[3:4])
        # modtime = self.fileobj.read(4) (bytes [4:8])
        # extraflag = self.fileobj.read(1) (bytes[8:9])
        # os = self.fileobj.read(1) (bytes[9:10])
        # self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def readline(self, size=-1):
        """Return the next line, reading at most ``size`` bytes.

        Tuned to remove buffer length calls in _unread and...

        also removes multiple len(c) calls, inlines _unread,
        total savings - lsprof 5800 to 5300
        phase 2:
        4168 calls in 2233
        8176 calls to read() in 1684
        changing the min chunk size to 200 halved all the cache misses
        leading to a drop to:
        4168 calls in 1977
        4168 call to read() in 1646
        - i.e. just reduced the function call overhead. May be worth
          keeping.

        :param size: maximum number of bytes to return; a negative value
            means no limit.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(200, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs)    # Return resulting line

            # c is the chunk
            c = self.read(readsize)
            # number of bytes read
            len_c = len(c)
            i = c.find('\n')
            if size is not None:
                # We set i=size to break out of the loop under two
                # conditions: 1) there's no newline, and the chunk is
                # larger than size, or 2) there is a newline, but the
                # resulting line would be longer than 'size'.
                if i == -1 and len_c > size:
                    i = size - 1
                elif size <= i:
                    i = size - 1

            if i >= 0 or c == '':
                # if i>= 0 we have a newline or have triggered the above
                # if size is not None condition.
                # if c == '' its EOF.
                bufs.append(c[:i+1])    # Add portion of last chunk
                # -- inlined self._unread --
                # Push back the rest of the chunk.  The pushed-back slice
                # c[i+1:] holds len_c - i - 1 bytes; the bookkeeping must
                # use that exact length or extrasize/offset drift by one
                # byte on every call.
                len_rest = len_c - i - 1
                self.extrabuf = c[i+1:] + self.extrabuf
                self.extrasize = len_rest + self.extrasize
                self.offset -= len_rest
                # -- end inlined self._unread --
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len_c
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return the content as a list of lines, keeping line endings.

        :param sizehint: passed to ``read()`` as the byte count; zero or a
            negative value reads everything.
        """
        # optimise to avoid all the buffer manipulation
        # lsprof changed from:
        # 4168 calls in 5472 with 32000 calls to readline()
        # to :
        # 4168 calls in 417.
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = -1
        content = self.read(sizehint)
        return content.splitlines(True)

    def _unread(self, buf, len_buf=None):
        """Push ``buf`` back onto the front of the read-ahead buffer.

        tuned to remove unneeded len calls.

        because this is such an inner routine in readline, and readline is
        in many inner loops, this has been inlined into readline().

        The len_buf parameter combined with the reduction in len calls dropped
        the lsprof ms count for this routine on my test data from 800 to 200 -
        a 75% saving.

        :param buf: data to push back.
        :param len_buf: length of ``buf`` if the caller already knows it;
            computed with ``len(buf)`` when omitted.
        """
        if len_buf is None:
            len_buf = len(buf)
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len_buf + self.extrasize
        self.offset -= len_buf

    def write(self, data):
        """Compress ``data`` and write it to the underlying file object.

        :raises IOError: (errno.EBADF) if the file was not opened for writing.
        :raises ValueError: if the file has been closed.
        """
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        data_len = len(data)
        if data_len > 0:
            self.size = self.size + data_len
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += data_len

    def writelines(self, lines):
        """Write all of ``lines`` with a single ``write()`` call."""
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))

286

287

Older »