/brz/remove-bazaar : revision 0.200.1036

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

« back to all changes in this revision

Viewing changes to bzrlib/revfile.py

Committer: Jelmer Vernooij
Date: 2010-09-11 17:48:45 UTC
mto: (0.312.1 master) (6883.23.1 bundle-git)
mto: This revision was merged to the branch mainline in revision 6960.
Revision ID: jelmer@samba.org-20100911174845-ro06bb6gg6ws8jit

More work on roundtrip push support.

files added:
.bzrignore

COPYING

HACKING

INSTALL

Makefile

NEWS

README

TODO

__init__.py

branch.py

bzr-receive-pack

bzr-upload-pack

cache.py

commands.py

commit.py

config.py

dir.py

errors.py

fetch.py

help.py

hg.py

info.py

inventory.py

mapping.py

notes

notes/git-serve.txt

notes/mapping.txt

notes/roundtripping.txt

object_store.py

push.py

refs.py

remote.py

repository.py

revspec.py

roundtrip.py

send.py

server.py

setup.py

tests

tests/__init__.py

tests/test_blackbox.py

tests/test_branch.py

tests/test_builder.py

tests/test_cache.py

tests/test_dir.py

tests/test_fetch.py

tests/test_mapping.py

tests/test_object_store.py

tests/test_push.py

tests/test_refs.py

tests/test_remote.py

tests/test_repository.py

tests/test_revspec.py

tests/test_roundtrip.py

tests/test_transportgit.py

transportgit.py

tree.py

versionedfiles.py

workingtree.py

files removed:
.bzrignore

.rsyncexclude

NEWS

README

build-api

bzrlib

bzrlib/__init__.py

bzrlib/add.py

bzrlib/branch.py

bzrlib/check.py

bzrlib/commands.py

bzrlib/diff.py

bzrlib/errors.py

bzrlib/info.py

bzrlib/inventory.py

bzrlib/mdiff.py

bzrlib/newinventory.py

bzrlib/osutils.py

bzrlib/remotebranch.py

bzrlib/revfile.py

bzrlib/revision.py

bzrlib/store.py

bzrlib/tests.py

bzrlib/textui.py

bzrlib/trace.py

bzrlib/tree.py

bzrlib/xml.py

doc/Makefile

doc/adoption.txt

doc/bitkeeper.txt

doc/changelogs.txt

doc/cherry-picking.txt

doc/cmdref.txt

doc/common-format.txt

doc/compared-aegis.txt

doc/compared-codeville.txt

doc/compared-cvsnt.txt

doc/compared-opencm.txt

doc/compared-prcs.txt

doc/compared-teamware.txt

doc/compression.txt

doc/config-specs.txt

doc/conflicts.txt

doc/costs.txt

doc/darcs.txt

doc/deadly-sins.txt

doc/default.css

doc/design.txt

doc/extra-commands.txt

doc/faq.txt

doc/formats.txt

doc/hashes.txt

doc/ignore.txt

doc/index.txt

doc/interrupted.txt

doc/intro.txt

doc/inventory.txt

doc/join-branches.txt

doc/kill-version.txt

doc/layers.txt

doc/library-interface.txt

doc/merge.txt

doc/mirroring.txt

doc/monotone.txt

doc/news.txt

doc/optional-edit.txt

doc/partial-commit.txt

doc/pool.txt

doc/purpose.txt

doc/python.txt

doc/quickref.txt

doc/quilt.txt

doc/quotes.txt

doc/random.txt

doc/requirements.txt

doc/revision-syntax.txt

doc/roadmap.txt

doc/rollup.txt

doc/scalability.txt

doc/security.txt

doc/shared-branches.txt

doc/short-demo.txt

doc/supportability.txt

doc/svk.txt

doc/tagging.txt

doc/taxonomy.txt

doc/testing.txt

doc/thanks.txt

doc/todo-from-arch.txt

doc/unchanged.txt

doc/unrelated-merge.txt

doc/usability.txt

doc/use-cases.txt

doc/web-interface.txt

doc/work-order.txt

doc/workflow.txt

doc/yaml.txt

elementtree

elementtree/ElementTree.py

elementtree/__init__.py

notes

notes/new-inventory-sample.xml

notes/performance.txt

setup.py

test.sh

urlgrabber

urlgrabber/__init__.py

urlgrabber/byterange.py

urlgrabber/grabber.py

urlgrabber/keepalive.py

urlgrabber/mirror.py

urlgrabber/progress.py

Show diffs side-by-side

added added

removed removed

bzrlib/revfile.py

#! /usr/bin/env python

# based on an idea by Matt Mackall

# modified to squish into bzr by Martin Pool

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Packed file revision storage.

A Revfile holds the text history of a particular source file, such

as Makefile. It can represent a tree of text versions for that

file, allowing for microbranches within a single repository.

This is stored on disk as two files: an index file, and a data file.

The index file is short and always read completely into memory; the

data file is much longer and only the relevant bits of it,

identified by the index file, need to be read.

Each text version is identified by the SHA-1 of the full text of

that version. It also has a sequence number within the file.

The index file has a short header and then a sequence of fixed-length

records:

* byte[20] SHA-1 of text (as binary, not hex)

* uint32 sequence number this is based on, or 0xFFFFFFFF (-1 as unsigned) for full text

* uint32 flags: 1=zlib compressed

* uint32 offset in text file of start

* uint32 length of compressed delta in text file

* uint32[3] reserved

total 48 bytes.

The header is also 48 bytes for tidiness and easy calculation.

Both the index and the text are only ever appended to; a consequence

is that sequence numbers are stable references. But not every

repository in the world will assign the same sequence numbers,

therefore the SHA-1 is the only universally unique reference.

This is meant to scale to hold 100,000 revisions of a single file, by

which time the index file will be ~4.8MB and a bit big to read

sequentially.

Some of the reserved fields could be used to implement a (semi?)

balanced tree indexed by SHA1 so we can much more efficiently find the

index associated with a particular hash. For 100,000 revs we would be

able to find it in about 17 random reads, which is not too bad.

This performs pretty well except when trying to calculate deltas of

really large files. For that the main thing would be to plug in

something faster than difflib, which is after all pure Python.

Another approach is to just store the gzipped full text of big files,

though perhaps that's too perverse?

"""

# TODO: Something like pread() would make this slightly simpler and

# perhaps more efficient.

# TODO: Could also try to mmap things... Might be faster for the

# index in particular?

# TODO: Some kind of faster lookup of SHAs? The bad thing is that probably means

# rewriting existing records, which is not so nice.

# TODO: Something to check that regions identified in the index file

# completely butt up and do not overlap. Strictly it's not a problem

# if there are gaps and that can happen if we're interrupted while

# writing to the datafile. Overlapping would be very bad though.

import sys, zlib, struct, mdiff, stat, os, sha

from binascii import hexlify, unhexlify

factor = 10

_RECORDSIZE = 48

_HEADER = "bzr revfile v1\n"

_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))

_NO_RECORD = 0xFFFFFFFFL

100

# fields in the index record

101

I_SHA = 0

102

I_BASE = 1

103

I_FLAGS = 2

104

I_OFFSET = 3

105

I_LEN = 4

106

107

FL_GZIP = 1

108

109

# maximum number of patches in a row before recording a whole text.

110

CHAIN_LIMIT = 50

111

112

113

class RevfileError(Exception):
    """Raised when a revfile is malformed, corrupt or used incorrectly."""

115

116

class LimitHitException(Exception):
    """Raised when walking a delta chain exceeds the caller's limit."""

118

119

class Revfile:
    """Append-only store for the text versions of a single file.

    A Revfile named ``basename`` lives in two files: ``basename.irev``, an
    index of fixed-length records preceded by a header of the same length,
    and ``basename.drev``, which holds the stored data.  Each index record
    unpacks to a tuple ``(sha, base, flags, offset, length)`` addressed by
    the ``I_*`` constants.

    Instances behave like a read-only sequence of index records: ``len()``
    gives the number of records, ``self[i]`` returns record ``i``, and
    iteration walks the records in order.
    """

    def __init__(self, basename):
        """Open the revfile called basename, creating it if it is absent.

        Raises RevfileError if only one of the two files exists or if the
        index header is not the expected one.  Any file already opened is
        closed again before the error propagates.
        """
        # TODO: Option to open readonly

        # TODO: Lock file while open

        # TODO: advise of random access

        self.basename = basename

        idxname = basename + '.irev'
        dataname = basename + '.drev'

        idx_exists = os.path.exists(idxname)
        data_exists = os.path.exists(dataname)

        if idx_exists != data_exists:
            raise RevfileError("half-assed revfile")

        self.idxfile = None
        self.datafile = None
        try:
            if not idx_exists:
                self.idxfile = open(idxname, 'w+b')
                self.datafile = open(dataname, 'w+b')

                print('init empty file')
                self.idxfile.write(_HEADER)
                self.idxfile.flush()
            else:
                self.idxfile = open(idxname, 'r+b')
                self.datafile = open(dataname, 'r+b')

                h = self.idxfile.read(_RECORDSIZE)
                if h != _HEADER:
                    raise RevfileError("bad header %r in index of %r"
                                       % (h, self.basename))
        except:
            # Do not leak half-opened files; the original error is re-raised.
            self.close()
            raise

    def close(self):
        """Close the index and data files.  Safe to call more than once."""
        for attr in ('idxfile', 'datafile'):
            f = getattr(self, attr, None)
            if f is not None:
                f.close()
                setattr(self, attr, None)

    def _check_index(self, idx):
        """Raise RevfileError unless idx is the index of an existing record."""
        # Valid indexes run from 0 to len(self) - 1 inclusive.
        if idx < 0 or idx >= len(self):
            raise RevfileError("invalid index %r" % idx)

    def find_sha(self, s):
        """Return the index of the record whose SHA-1 is s.

        s is the 20-byte binary digest.  Returns _NO_RECORD if no record
        matches.  This is a linear scan over the whole index.
        """
        assert isinstance(s, str)
        assert len(s) == 20

        for idx, idxrec in enumerate(self):
            if idxrec[I_SHA] == s:
                return idx
        return _NO_RECORD

    def _add_compressed(self, text_sha, data, base, compress):
        """Store data, zlib-compressing it first if that makes it smaller.

        Compression is only attempted when compress is true and data is
        longer than 50 bytes.  Returns the index of the new record.
        """
        # well, maybe compress
        flags = 0
        if compress and len(data) > 50:
            # don't do compression if it's too small; it's unlikely to win
            # enough to be worthwhile
            compr_data = zlib.compress(data)
            if len(compr_data) < len(data):
                data = compr_data
                flags = FL_GZIP
        return self._add_raw(text_sha, data, base, flags)

    def _add_raw(self, text_sha, data, base, flags):
        """Append pre-processed data, which can be a full text or a delta.

        data is written exactly as given: any compression must already have
        been applied and be reflected in flags.  The data is flushed to the
        data file before the index record is written.  Returns the index of
        the new record.
        """
        idx = len(self)
        self.datafile.seek(0, 2)        # to end
        self.idxfile.seek(0, 2)
        assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
        data_offset = self.datafile.tell()

        assert isinstance(data, str)    # not unicode or anything weird

        self.datafile.write(data)
        self.datafile.flush()

        assert isinstance(text_sha, str)
        entry = text_sha
        entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
        assert len(entry) == _RECORDSIZE

        self.idxfile.write(entry)
        self.idxfile.flush()

        return idx

    def _add_full_text(self, text, text_sha, compress):
        """Add a full text to the file.

        This is not compressed against any reference version.

        Returns the index for that text."""
        return self._add_compressed(text_sha, text, _NO_RECORD, compress)

    def _add_delta(self, text, text_sha, base, compress):
        """Add a text stored relative to a previous text.

        Falls back to storing the full text when the delta chain behind
        base is already CHAIN_LIMIT long or when the delta is no smaller
        than the text.  Raises RevfileError if base is not an existing
        record.  Returns the index of the new record.
        """
        self._check_index(base)

        try:
            base_text = self.get(base, recursion_limit=CHAIN_LIMIT)
        except LimitHitException:
            return self._add_full_text(text, text_sha, compress)

        data = mdiff.bdiff(base_text, text)

        # If the delta is larger than the text, we might as well just
        # store the text.  (OK, the delta might be more compressible,
        # but the overhead of applying it probably still makes it
        # bad, and I don't want to compress both of them to find out.)
        if len(data) >= len(text):
            return self._add_full_text(text, text_sha, compress)
        return self._add_compressed(text_sha, data, base, compress)

    def add(self, text, base=_NO_RECORD, compress=True):
        """Add a new text to the revfile.

        If the text is already present then its existing id is
        returned and the file is not changed.

        If compress is true then gzip compression will be used if it
        reduces the size.

        If a base index is specified, that text *may* be used for
        delta compression of the new text.  Delta compression will
        only be used if it would be a size win and if the existing
        base is not at too long of a delta chain already.
        """
        text_sha = sha.new(text).digest()

        idx = self.find_sha(text_sha)
        if idx != _NO_RECORD:
            # TODO: Optional paranoid mode where we read out that record and
            # make sure it's the same, in case someone ever breaks SHA-1.
            return idx                  # already present

        if base == _NO_RECORD:
            return self._add_full_text(text, text_sha, compress)
        return self._add_delta(text, text_sha, base, compress)

    def get(self, idx, recursion_limit=None):
        """Retrieve text of a previous revision.

        If recursion_limit is an integer then walk back at most that
        many revisions and then raise LimitHitException, indicating
        that we ought to record a new file text instead of another
        delta.  Don't use this when trying to get out an existing
        revision.

        Raises RevfileError if the reconstructed text does not match the
        SHA-1 recorded in the index.
        """
        idxrec = self[idx]
        if idxrec[I_BASE] == _NO_RECORD:
            text = self._get_full_text(idx, idxrec)
        else:
            text = self._get_patched(idx, idxrec, recursion_limit)

        if sha.new(text).digest() != idxrec[I_SHA]:
            raise RevfileError("corrupt SHA-1 digest on record %d"
                               % idx)

        return text

    def _get_raw(self, idx, idxrec):
        """Return the stored data for a record, decompressed if flagged.

        The result is a full text or a delta depending on the record.
        Raises RevfileError on unknown flag bits or a short read.
        """
        flags = idxrec[I_FLAGS]
        if flags & ~FL_GZIP:
            raise RevfileError("unsupported index flags %#x on index %d"
                               % (flags, idx))

        l = idxrec[I_LEN]
        if l == 0:
            return ''

        self.datafile.seek(idxrec[I_OFFSET])

        data = self.datafile.read(l)
        if len(data) != l:
            raise RevfileError("short read %d of %d "
                               "getting text for record %d in %r"
                               % (len(data), l, idx, self.basename))

        if flags & FL_GZIP:
            data = zlib.decompress(data)

        return data

    def _get_full_text(self, idx, idxrec):
        """Return the text of a record that is stored as a full text."""
        assert idxrec[I_BASE] == _NO_RECORD
        return self._get_raw(idx, idxrec)

    def _get_patched(self, idx, idxrec, recursion_limit):
        """Rebuild the text of a delta record from its base text.

        Raises LimitHitException once recursion_limit is used up, and
        RevfileError if the record does not refer to an earlier record.
        """
        base = idxrec[I_BASE]
        # A delta must be based on an earlier record; anything else is
        # corruption and would otherwise make us loop.  This is a real check
        # rather than an assert so it is still enforced under ``python -O``.
        if not (0 <= base < idx):
            raise RevfileError("invalid base %d for record %d in %r"
                               % (base, idx, self.basename))

        if recursion_limit is None:
            sub_limit = None
        else:
            sub_limit = recursion_limit - 1
            if sub_limit < 0:
                raise LimitHitException()

        base_text = self.get(base, sub_limit)
        patch = self._get_raw(idx, idxrec)

        return mdiff.bpatch(base_text, patch)

    def __len__(self):
        """Return number of revisions."""
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
        if l % _RECORDSIZE:
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
        if l < _RECORDSIZE:
            raise RevfileError("no header present in index of %r" % (self.basename))
        # The first record-sized slot is the header, not a revision.
        return l // _RECORDSIZE - 1

    def __getitem__(self, idx):
        """Index by sequence id returns the index field"""
        ## TODO: Can avoid seek if we just moved there...
        self._seek_index(idx)
        return self._read_next_index()

    def _seek_index(self, idx):
        """Position the index file at the start of record idx."""
        if idx < 0:
            raise RevfileError("invalid index %r" % idx)
        # + 1 skips the header slot.
        self.idxfile.seek((idx + 1) * _RECORDSIZE)

    def _read_next_index(self):
        """Read and unpack the index record at the current file position.

        Raises IndexError at end of file (which is what ends iteration over
        the Revfile) and RevfileError if only part of a record is present.
        """
        rec = self.idxfile.read(_RECORDSIZE)
        if not rec:
            raise IndexError("end of index file")
        elif len(rec) != _RECORDSIZE:
            # Work out which record was being read from where the read began.
            start = self.idxfile.tell() - len(rec)
            raise RevfileError("short read of %d bytes getting index %d from %r"
                               % (len(rec), start // _RECORDSIZE - 1,
                                  self.basename))

        return struct.unpack(">20sIIII12x", rec)

    def dump(self, f=sys.stdout):
        """Write a human-readable table of all index records to f."""
        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n'
                % tuple('idx sha1 base flags offset len'.split()))
        f.write('-------- ---------------------------------------- ')
        f.write('-------- -------- -------- --------\n')

        for i, rec in enumerate(self):
            f.write("#%-7d %40s " % (i, hexlify(rec[0])))
            if rec[1] == _NO_RECORD:
                f.write("(none) ")
            else:
                f.write("#%-7d " % rec[1])

            f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))

    def total_text_size(self):
        """Return the sum of sizes of all file texts.

        This is how much space they would occupy if they were stored without
        delta and gzip compression.

        As a side effect this completely validates the Revfile, checking that all
        texts can be reproduced with the correct SHA-1."""
        t = 0
        for idx in range(len(self)):
            t += len(self.get(idx))
        return t

416

417

418

419

def main(argv):
    """Command-line driver operating on the revfile named "testrev".

    argv[1] selects the command and argv[2], where needed, is its argument.
    Texts to add are read from stdin; results are written to stdout and
    diagnostics to stderr.

    Returns 1 on a usage error or failed lookup, and None on success.
    """
    # Check the command line before opening the revfile, because opening
    # creates the files on disk when they do not exist yet.
    try:
        cmd = argv[1]
    except IndexError:
        sys.stderr.write("usage: revfile dump\n"
                         " revfile add\n"
                         " revfile add-delta BASE\n"
                         " revfile get IDX\n"
                         " revfile find-sha HEX\n"
                         " revfile total-text-size\n"
                         " revfile last\n")
        return 1

    r = Revfile("testrev")

    if cmd == 'add':
        new_idx = r.add(sys.stdin.read())
        print(new_idx)
    elif cmd == 'add-delta':
        try:
            base = int(argv[2])
        except (IndexError, ValueError):
            # missing or non-numeric BASE
            sys.stderr.write("usage: revfile add-delta BASE\n")
            return 1

        new_idx = r.add(sys.stdin.read(), base)
        print(new_idx)
    elif cmd == 'dump':
        r.dump()
    elif cmd == 'get':
        try:
            idx = int(argv[2])
        except (IndexError, ValueError):
            # missing or non-numeric IDX
            sys.stderr.write("usage: revfile get IDX\n")
            return 1

        if idx < 0 or idx >= len(r):
            sys.stderr.write("invalid index %r\n" % idx)
            return 1

        sys.stdout.write(r.get(idx))
    elif cmd == 'find-sha':
        try:
            s = unhexlify(argv[2])
        except (IndexError, TypeError, ValueError):
            # missing argument, or not valid hex
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1

        # find_sha() only accepts a 20-byte binary SHA-1 digest.
        if len(s) != 20:
            sys.stderr.write("usage: revfile find-sha HEX\n")
            return 1

        idx = r.find_sha(s)
        if idx == _NO_RECORD:
            sys.stderr.write("no such record\n")
            return 1
        else:
            print(idx)
    elif cmd == 'total-text-size':
        print(r.total_text_size())
    elif cmd == 'last':
        print(len(r) - 1)
    else:
        sys.stderr.write("unknown command %r\n" % cmd)
        return 1

474

475

476

if __name__ == '__main__':
    import sys
    # main() returns None on success, so map that to exit status 0.
    exit_status = main(sys.argv) or 0
    sys.exit(exit_status)

Older »