/brz/remove-bazaar : revision 1666.1.6

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/tuned_gzip.py

Committer: Robert Collins
Date: 2006-04-19 23:32:08 UTC
mto: (1711.1.1 integration)
mto: This revision was merged to the branch mainline in revision 1674.
Revision ID: robertc@robertcollins.net-20060419233208-2ed6906796994316

Make knit the default format.
Adjust affected tests to either have knit-specific values or to be more generic,
as appropriate.
Disable all SFTP prefetching for known paramikos - direct readv support is now
a TODO.

files added:
.bzrignore

.rsyncexclude

BRANCH.TODO

HACKING

INSTALL

Makefile

NEWS

NEWS.developers

README

TODO

build-api

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/annotate.py

bzrlib/atomicfile.py

bzrlib/branch.py

bzrlib/builtins.py

bzrlib/bzrdir.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/commit.py

bzrlib/config.py

bzrlib/conflicts.py

bzrlib/decorators.py

bzrlib/delta.py

bzrlib/diff.py

bzrlib/doc

bzrlib/doc/__init__.py

bzrlib/doc/api

bzrlib/doc/api/__init__.py

bzrlib/doc/api/branch.txt

bzrlib/doc/api/transport.txt

bzrlib/errors.py

bzrlib/export

bzrlib/export/__init__.py

bzrlib/export/dir_exporter.py

bzrlib/export/tar_exporter.py

bzrlib/export/zip_exporter.py

bzrlib/externalcommand.py

bzrlib/fetch.py

bzrlib/gpg.py

bzrlib/graph.py

bzrlib/hashcache.py

bzrlib/help.py

bzrlib/identitymap.py

bzrlib/info.py

bzrlib/inter.py

bzrlib/intset.py

bzrlib/inventory.py

bzrlib/iterablefile.py

bzrlib/knit.py

bzrlib/lock.py

bzrlib/lockable_files.py

bzrlib/lockdir.py

bzrlib/log.py

bzrlib/lsprof.py

bzrlib/merge.py

bzrlib/merge3.py

bzrlib/missing.py

bzrlib/msgeditor.py

bzrlib/option.py

bzrlib/osutils.py

bzrlib/patch.py

bzrlib/plugin.py

bzrlib/plugins

bzrlib/plugins/__init__.py

bzrlib/progress.py

bzrlib/reconcile.py

bzrlib/repository.py

bzrlib/revision.py

bzrlib/revisionspec.py

bzrlib/rio.py

bzrlib/shellcomplete.py

bzrlib/sign_my_commits.py

bzrlib/status.py

bzrlib/store

bzrlib/store/__init__.py

bzrlib/store/revision

bzrlib/store/revision/__init__.py

bzrlib/store/revision/knit.py

bzrlib/store/revision/text.py

bzrlib/store/text.py

bzrlib/store/versioned

bzrlib/store/versioned/__init__.py

bzrlib/symbol_versioning.py

bzrlib/testament.py

bzrlib/tests

bzrlib/tests/HTTPTestUtil.py

bzrlib/tests/TestUtil.py

bzrlib/tests/__init__.py

bzrlib/tests/blackbox

bzrlib/tests/blackbox/__init__.py

bzrlib/tests/blackbox/test_added.py

bzrlib/tests/blackbox/test_aliases.py

bzrlib/tests/blackbox/test_ancestry.py

bzrlib/tests/blackbox/test_bound_branches.py

bzrlib/tests/blackbox/test_break_lock.py

bzrlib/tests/blackbox/test_cat.py

bzrlib/tests/blackbox/test_checkout.py

bzrlib/tests/blackbox/test_commit.py

bzrlib/tests/blackbox/test_conflicts.py

bzrlib/tests/blackbox/test_diff.py

bzrlib/tests/blackbox/test_export.py

bzrlib/tests/blackbox/test_find_merge_base.py

bzrlib/tests/blackbox/test_help.py

bzrlib/tests/blackbox/test_info.py

bzrlib/tests/blackbox/test_init.py

bzrlib/tests/blackbox/test_log.py

bzrlib/tests/blackbox/test_logformats.py

bzrlib/tests/blackbox/test_merge.py

bzrlib/tests/blackbox/test_missing.py

bzrlib/tests/blackbox/test_outside_wt.py

bzrlib/tests/blackbox/test_pull.py

bzrlib/tests/blackbox/test_push.py

bzrlib/tests/blackbox/test_re_sign.py

bzrlib/tests/blackbox/test_reconcile.py

bzrlib/tests/blackbox/test_revert.py

bzrlib/tests/blackbox/test_revision_info.py

bzrlib/tests/blackbox/test_revno.py

bzrlib/tests/blackbox/test_selftest.py

bzrlib/tests/blackbox/test_shared_repository.py

bzrlib/tests/blackbox/test_sign_my_commits.py

bzrlib/tests/blackbox/test_status.py

bzrlib/tests/blackbox/test_too_much.py

bzrlib/tests/blackbox/test_uncommit.py

bzrlib/tests/blackbox/test_update.py

bzrlib/tests/blackbox/test_upgrade.py

bzrlib/tests/blackbox/test_versioning.py

bzrlib/tests/branch_implementations

bzrlib/tests/branch_implementations/__init__.py

bzrlib/tests/branch_implementations/test_bound_sftp.py

bzrlib/tests/branch_implementations/test_branch.py

bzrlib/tests/branch_implementations/test_parent.py

bzrlib/tests/branch_implementations/test_permissions.py

bzrlib/tests/branch_implementations/test_pull.py

bzrlib/tests/branch_implementations/test_update.py

bzrlib/tests/bzrdir_implementations

bzrlib/tests/bzrdir_implementations/__init__.py

bzrlib/tests/bzrdir_implementations/test_bzrdir.py

bzrlib/tests/interrepository_implementations

bzrlib/tests/interrepository_implementations/__init__.py

bzrlib/tests/interrepository_implementations/test_interrepository.py

bzrlib/tests/interversionedfile_implementations

bzrlib/tests/interversionedfile_implementations/__init__.py

bzrlib/tests/interversionedfile_implementations/test_join.py

bzrlib/tests/repository_implementations

bzrlib/tests/repository_implementations/__init__.py

bzrlib/tests/repository_implementations/test_fileid_involved.py

bzrlib/tests/repository_implementations/test_reconcile.py

bzrlib/tests/repository_implementations/test_repository.py

bzrlib/tests/revisionstore_implementations

bzrlib/tests/revisionstore_implementations/__init__.py

bzrlib/tests/revisionstore_implementations/test_all.py

bzrlib/tests/stub_sftp.py

bzrlib/tests/test_ancestry.py

bzrlib/tests/test_annotate.py

bzrlib/tests/test_api.py

bzrlib/tests/test_bad_files.py

bzrlib/tests/test_branch.py

bzrlib/tests/test_bzrdir.py

bzrlib/tests/test_command.py

bzrlib/tests/test_commit.py

bzrlib/tests/test_commit_merge.py

bzrlib/tests/test_config.py

bzrlib/tests/test_conflicts.py

bzrlib/tests/test_decorators.py

bzrlib/tests/test_diff.py

bzrlib/tests/test_doc_generate.py

bzrlib/tests/test_errors.py

bzrlib/tests/test_escaped_store.py

bzrlib/tests/test_fetch.py

bzrlib/tests/test_gpg.py

bzrlib/tests/test_graph.py

bzrlib/tests/test_hashcache.py

bzrlib/tests/test_http.py

bzrlib/tests/test_identitymap.py

bzrlib/tests/test_inv.py

bzrlib/tests/test_knit.py

bzrlib/tests/test_lockable_files.py

bzrlib/tests/test_lockdir.py

bzrlib/tests/test_log.py

bzrlib/tests/test_merge.py

bzrlib/tests/test_merge3.py

bzrlib/tests/test_merge_core.py

bzrlib/tests/test_missing.py

bzrlib/tests/test_msgeditor.py

bzrlib/tests/test_nonascii.py

bzrlib/tests/test_options.py

bzrlib/tests/test_osutils.py

bzrlib/tests/test_permissions.py

bzrlib/tests/test_plugins.py

bzrlib/tests/test_progress.py

bzrlib/tests/test_reconcile.py

bzrlib/tests/test_repository.py

bzrlib/tests/test_revision.py

bzrlib/tests/test_revisionnamespaces.py

bzrlib/tests/test_revprops.py

bzrlib/tests/test_rio.py

bzrlib/tests/test_sampler.py

bzrlib/tests/test_selftest.py

bzrlib/tests/test_setup.py

bzrlib/tests/test_sftp_transport.py

bzrlib/tests/test_smart_add.py

bzrlib/tests/test_source.py

bzrlib/tests/test_store.py

bzrlib/tests/test_symbol_versioning.py

bzrlib/tests/test_testament.py

bzrlib/tests/test_trace.py

bzrlib/tests/test_transactions.py

bzrlib/tests/test_transform.py

bzrlib/tests/test_transport.py

bzrlib/tests/test_transport_implementations.py

bzrlib/tests/test_tsort.py

bzrlib/tests/test_tuned_gzip.py

bzrlib/tests/test_ui.py

bzrlib/tests/test_upgrade.py

bzrlib/tests/test_versionedfile.py

bzrlib/tests/test_weave.py

bzrlib/tests/test_whitebox.py

bzrlib/tests/test_workingtree.py

bzrlib/tests/test_xml.py

bzrlib/tests/treeshape.py

bzrlib/tests/workingtree_implementations

bzrlib/tests/workingtree_implementations/__init__.py

bzrlib/tests/workingtree_implementations/test_basis_inventory.py

bzrlib/tests/workingtree_implementations/test_is_control_filename.py

bzrlib/tests/workingtree_implementations/test_pull.py

bzrlib/tests/workingtree_implementations/test_workingtree.py

bzrlib/textinv.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/transactions.py

bzrlib/transform.py

bzrlib/transport

bzrlib/transport/__init__.py

bzrlib/transport/decorator.py

bzrlib/transport/fakenfs.py

bzrlib/transport/fakevfat.py

bzrlib/transport/ftp.py

bzrlib/transport/http

bzrlib/transport/http/__init__.py

bzrlib/transport/http/_pycurl.py

bzrlib/transport/http/_urllib.py

bzrlib/transport/local.py

bzrlib/transport/memory.py

bzrlib/transport/readonly.py

bzrlib/transport/sftp.py

bzrlib/tree.py

bzrlib/tsort.py

bzrlib/tuned_gzip.py

bzrlib/ui

bzrlib/ui/__init__.py

bzrlib/ui/text.py

bzrlib/uncommit.py

bzrlib/upgrade.py

bzrlib/util

bzrlib/util/__init__.py

bzrlib/util/configobj

bzrlib/util/configobj/__init__.py

bzrlib/util/configobj/configobj.py

bzrlib/util/configobj/docs

bzrlib/util/configobj/docs/BSD-LICENSE.txt

bzrlib/util/configobj/docs/configobj.txt

bzrlib/util/configobj/docs/validate.txt

bzrlib/util/configobj/validate.py

bzrlib/util/effbot

bzrlib/util/effbot/__init__.py

bzrlib/util/effbot/org

bzrlib/util/effbot/org/__init__.py

bzrlib/util/effbot/org/gzip_consumer.py

bzrlib/util/effbot/org/http_client.py

bzrlib/util/effbot/org/http_manager.py

bzrlib/util/elementtree

bzrlib/util/elementtree/ElementTree.py

bzrlib/util/elementtree/__init__.py

bzrlib/util/urlgrabber

bzrlib/util/urlgrabber/__init__.py

bzrlib/util/urlgrabber/byterange.py

bzrlib/util/urlgrabber/grabber.py

bzrlib/util/urlgrabber/keepalive.py

bzrlib/util/urlgrabber/mirror.py

bzrlib/util/urlgrabber/progress.py

bzrlib/versionedfile.py

bzrlib/weave.py

bzrlib/weave_commands.py

bzrlib/weavefile.py

bzrlib/win32console.py

bzrlib/workingtree.py

bzrlib/xml4.py

bzrlib/xml5.py

bzrlib/xml_serializer.py

contrib

contrib/add-bzr-to-baz

contrib/bash

contrib/bash/bzr

contrib/bash/bzr.simple

contrib/create_bzr_rollup.py

contrib/emacs

contrib/emacs/bzr-mode.el

contrib/fortune

contrib/newinventory.py

contrib/pwclient.full

contrib/pwk

contrib/upload-bzr.dev

contrib/zsh

contrib/zsh/_bzr

generate_docs.py

notes

setup.py

tools

tools/__init__.py

tools/biobench.py

tools/capture_tree.py

tools/convertfile.py

tools/convertinv.py

tools/doc_generate

tools/doc_generate/__init__.py

tools/doc_generate/autodoc_bash_completion.py

tools/doc_generate/autodoc_man.py

tools/history2revfiles.py

tools/http_client.py

tools/riodemo.py

tools/trace-revisions

tools/weavebench.py

tools/weavemerge.sh

tutorial.txt

files removed:
.bzrignore

COPYING

HACKING

INSTALL

Makefile

NEWS

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

cache.py

commands.py

commit.py

config.py

dir.py

errors.py

fetch.py

help.py

hg.py

info.py

inventory.py

mapping.py

notes

notes/git-serve.txt

notes/mapping.txt

notes/roundtripping.txt

object_store.py

push.py

refs.py

remote.py

repository.py

revspec.py

roundtrip.py

send.py

server.py

setup.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_cache.py

tests/test_dir.py

tests/test_fetch.py

tests/test_mapping.py

tests/test_object_store.py

tests/test_push.py

tests/test_refs.py

tests/test_remote.py

tests/test_repository.py

tests/test_revspec.py

tests/test_roundtrip.py

tests/test_transportgit.py

transportgit.py

tree.py

versionedfiles.py

workingtree.py

Show diffs side-by-side

added added

removed removed

bzrlib/tuned_gzip.py

# Written by Robert Collins <robert.collins@canonical.com>

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Bzrlib specific gzip tunings. We plan to feed these to the upstream gzip."""

# make GzipFile faster:

import gzip

from gzip import U32, LOWU32, FEXTRA, FCOMMENT, FNAME, FHCRC

import sys

import struct

import zlib

# we want a \n preserved, break on \n only splitlines.

import bzrlib

__all__ = ["GzipFile"]

class GzipFile(gzip.GzipFile):
    """Knit tuned version of GzipFile.

    Overrides the hot paths of the stock (python 2.4) ``gzip.GzipFile`` to
    cut down on function calls, repeated ``len()`` calls and file seeks.

    This is based on the following lsprof stats:
    python 2.4 stock GzipFile write:
    58971      0   5644.3090   2721.4730   gzip:193(write)
    +58971     0   1159.5530   1159.5530   +<built-in method compress>
    +176913    0    987.0320    987.0320   +<len>
    +58971     0    423.1450    423.1450   +<zlib.crc32>
    +58971     0    353.1060    353.1060   +<method 'write' of 'cStringIO.StringO' objects>
    tuned GzipFile write:
    58971      0   4477.2590   2103.1120   bzrlib.knit:1250(write)
    +58971     0   1297.7620   1297.7620   +<built-in method compress>
    +58971     0    406.2160    406.2160   +<zlib.crc32>
    +58971     0    341.9020    341.9020   +<method 'write' of 'cStringIO.StringO' objects>
    +58971     0    328.2670    328.2670   +<len>

    Yes, it's only 1.6 seconds, but they add up.
    """

    def _add_read_data(self, data):
        """Account for a chunk of freshly decompressed data.

        Updates the running CRC, appends ``data`` to the read-ahead buffer
        and bumps both the buffered size and the total uncompressed size.

        :param data: decompressed bytes to append to ``self.extrabuf``.
        """
        # 4169 calls in 183
        # temp var for len(data) and switch to +='s.
        # 4169 in 139
        len_data = len(data)
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf += data
        self.extrasize += len_data
        self.size += len_data

    def _read(self, size=1024):
        """Read up to ``size`` compressed bytes and buffer what they decode to.

        The decompressed output is stored via ``_add_read_data``; nothing is
        returned.

        :param size: number of compressed bytes to request from the
            underlying file object.
        :raises EOFError: when the file object is closed or no further
            compressed data / gzip member is available.
        """
        # various optimisations:
        # reduces lsprof count from 2500 to
        # 8337 calls in 1272, 365 internal
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            next_header_bytes = self.fileobj.read(10)
            if next_header_bytes == '':
                raise EOFError("Reached EOF")

            self._init_read()
            # Hand over the 10 bytes already consumed so the header parser
            # does not need to seek back and re-read them.
            self._read_gzip_header(next_header_bytes)
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.
        if buf == "":
            self._add_read_data(self.decompress.flush())
            assert len(self.decompress.unused_data) >= 8, "what does flush do?"
            self._read_eof()
            # tell the driving read() call we have stuffed all the data
            # in self.extrabuf
            raise EOFError('Reached EOF')

        self._add_read_data(self.decompress.decompress(buf))

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file.
            # The decompressor's unused data starts with this member's
            # 8 byte trailer (crc + size); anything beyond that belongs to
            # the next member, so the file position has to be moved back by
            # len(unused_data) - 8 before the next gzip header is read.
            seek_length = len(self.decompress.unused_data) - 8
            if seek_length > 0:
                # we read too much data
                self.fileobj.seek(-seek_length, 1)
            elif seek_length < 0:
                # we haven't read enough to check the checksum.
                assert -8 < seek_length, "too great a seek."
                buf = self.fileobj.read(-seek_length)
                # The stream has already ended, so this only extends
                # unused_data with the rest of the trailer.
                self.decompress.decompress(buf)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _read_eof(self):
        """Verify the gzip trailer of the member that just ended.

        Tuned to reduce function calls and eliminate file seeking:
        pass 1:
        reduces lsprof count from 800 to 288
        4168 in 296
        avoid U32 call by using struct format L
        4168 in 200

        :raises IOError: if the stored CRC or the stored length does not
            match what was computed while decompressing.
        """
        # We've read to the end of the file, so we should have 8 bytes of
        # unused data in the decompressor. If we don't, there is a corrupt
        # file. We use these 8 bytes to calculate the CRC and the recorded
        # file size. We then check that the computed CRC and size of the
        # uncompressed data match the stored values. Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<LL", self.decompress.unused_data[0:8])
        # note that isize is unsigned - it can exceed 2GB
        if crc32 != U32(self.crc):
            raise IOError("CRC check failed %d %d" % (crc32, U32(self.crc)))
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def _read_gzip_header(self, bytes=None):
        """Parse and discard a gzip member header.

        Supply bytes if the minimum header size is already read.

        starting cost: 300 in 3998
        15998 reads from 3998 calls
        final cost 168

        :param bytes: 10 bytes of header data; when None they are read
            from ``self.fileobj``.
        :raises IOError: if the magic number or compression method is not
            the expected gzip/deflate value.
        """
        if bytes is None:
            bytes = self.fileobj.read(10)
        magic = bytes[0:2]
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(bytes[2:3])
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(bytes[3:4])
        # The remaining fixed header fields are not needed:
        # modtime = self.fileobj.read(4) (bytes [4:8])
        # extraflag = self.fileobj.read(1) (bytes[8:9])
        # os = self.fileobj.read(1) (bytes[9:10])
        # self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            # (its length is a 16-bit little-endian integer).
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def readline(self, size=-1):
        """Read one line, returning at most ``size`` bytes.

        Tuned to remove buffer length calls in _unread and...

        also removes multiple len(c) calls, inlines _unread,
        total savings - lsprof 5800 to 5300
        phase 2:
        4168 calls in 2233
        8176 calls to read() in 1684
        changing the min chunk size to 200 halved all the cache misses
        leading to a drop to:
        4168 calls in 1977
        4168 call to read() in 1646
        - i.e. just reduced the function call overhead. May be worth
        keeping.

        :param size: maximum number of bytes to return; a negative value
            means no limit.
        :return: the line including its trailing newline if one was found
            within ``size`` bytes, or '' at end of file.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(200, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs)  # Return resulting line

            # c is the chunk
            c = self.read(readsize)
            # number of bytes read
            len_c = len(c)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i == -1 and len_c > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                # if i >= 0 we have a newline or have triggered the size
                # limit above.
                # if c == '' it's EOF.
                bufs.append(c[:i+1])    # Add portion of last chunk
                # -- inlined self._unread --
                # Push back the rest of the chunk. c[i+1:] holds exactly
                # len_c - i - 1 bytes; adjusting the counters by len_c - i
                # (as this code used to) left extrasize one too high and
                # offset one too low after every readline() call.
                pushed_back = len_c - i - 1
                self.extrabuf = c[i+1:] + self.extrabuf
                self.extrasize = pushed_back + self.extrasize
                self.offset -= pushed_back
                # -- end inlined self._unread --
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len_c
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return the file content split into lines.

        Unlike the stock implementation this performs a single read() and
        splits the result, rather than calling readline() repeatedly.

        :param sizehint: when positive it is passed straight to read() as
            the number of bytes to fetch; zero or negative reads everything.
        :return: whatever bzrlib.osutils.split_lines returns for the content
            that was read.
        """
        # optimise to avoid all the buffer manipulation
        # lsprof changed from:
        # 4168 calls in 5472 with 32000 calls to readline()
        # to :
        # 4168 calls in 417.
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = -1
        content = self.read(sizehint)
        # NOTE(review): only 'bzrlib' itself is imported by this module, so
        # this presumably relies on bzrlib.osutils having been imported
        # elsewhere - confirm.
        return bzrlib.osutils.split_lines(content)

    def _unread(self, buf, len_buf=None):
        """Push ``buf`` back onto the front of the read-ahead buffer.

        Tuned to remove unneeded len calls.

        Because this is such an inner routine in readline, and readline is
        in many inner loops, this has been inlined into readline().

        The len_buf parameter combined with the reduction in len calls dropped
        the lsprof ms count for this routine on my test data from 800 to 200 -
        a 75% saving.

        :param buf: the data to push back.
        :param len_buf: len(buf) if the caller already knows it, else None.
        """
        if len_buf is None:
            len_buf = len(buf)
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len_buf + self.extrasize
        self.offset -= len_buf

    def write(self, data):
        """Compress ``data`` and write it to the underlying file object.

        :param data: the uncompressed data to write; empty data is a no-op.
        :raises IOError: (errno EBADF) if the file was not opened for writing.
        :raises ValueError: if the file has been closed.
        """
        if self.mode != gzip.WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        # len(data) is computed once and reused for size and offset.
        data_len = len(data)
        if data_len > 0:
            self.size = self.size + data_len
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += data_len

    def writelines(self, lines):
        """Write a sequence of strings as a single write() call.

        :param lines: iterable of strings; they are joined without any
            separator being added.
        """
        # profiling indicated a significant overhead
        # calling write for each line.
        # this batch call is a lot faster :).
        # (4 seconds to 1 seconds for the sample upgrades I was testing).
        self.write(''.join(lines))

294

295

Older »