bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
1 |
#! /usr/bin/env python
|
2 |
||
3 |
# (C) 2005 Matt Mackall
|
|
4 |
||
5 |
# modified to squish into bzr by Martin Pool
|
|
6 |
||
7 |
# This program is free software; you can redistribute it and/or modify
|
|
8 |
# it under the terms of the GNU General Public License as published by
|
|
9 |
# the Free Software Foundation; either version 2 of the License, or
|
|
10 |
# (at your option) any later version.
|
|
11 |
||
12 |
# This program is distributed in the hope that it will be useful,
|
|
13 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 |
# GNU General Public License for more details.
|
|
16 |
||
17 |
# You should have received a copy of the GNU General Public License
|
|
18 |
# along with this program; if not, write to the Free Software
|
|
19 |
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
20 |
||
21 |
||
22 |
"""Packed file revision storage.
|
|
23 |
||
24 |
A Revfile holds the text history of a particular source file, such
|
|
25 |
as Makefile. It can represent a tree of text versions for that
|
|
26 |
file, allowing for microbranches within a single repository.
|
|
27 |
||
28 |
This is stored on disk as two files: an index file, and a data file.
|
|
29 |
The index file is short and always read completely into memory; the
|
|
30 |
data file is much longer and only the relevant bits of it,
|
|
31 |
identified by the index file, need to be read.
|
|
32 |
||
33 |
Each text version is identified by the SHA-1 of the full text of
|
|
34 |
that version. It also has a sequence number within the file.
|
|
35 |
||
36 |
The index file has a short header and then a sequence of fixed-length
|
|
37 |
records:
|
|
38 |
||
39 |
* byte[20] SHA-1 of text (as binary, not hex)
|
|
40 |
* uint32 sequence number this is based on, or -1 for full text
|
|
41 |
* uint32 flags: 1=zlib compressed
|
|
42 |
* uint32 offset in text file of start
|
|
43 |
* uint32 length of compressed delta in text file
|
|
44 |
* uint32[3] reserved
|
|
45 |
||
46 |
total 48 bytes.
|
|
47 |
||
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
48 |
The header is also 48 bytes for tidyness and easy calculation.
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
49 |
|
50 |
Both the index and the text are only ever appended to; a consequence
|
|
51 |
is that sequence numbers are stable references. But not every
|
|
52 |
repository in the world will assign the same sequence numbers,
|
|
53 |
therefore the SHA-1 is the only universally unique reference.
|
|
54 |
||
55 |
This is meant to scale to hold 100,000 revisions of a single file, by
|
|
56 |
which time the index file will be ~4.8MB and a bit big to read
|
|
57 |
sequentially.
|
|
58 |
||
59 |
Some of the reserved fields could be used to implement a (semi?)
|
|
60 |
balanced tree indexed by SHA1 so we can much more efficiently find the
|
|
61 |
index associated with a particular hash. For 100,000 revs we would be
|
|
62 |
able to find it in about 17 random reads, which is not too bad.
|
|
63 |
"""
|
|
64 |
||
65 |
||
66 |
||
67 |
import sys, zlib, struct, mdiff, stat, os, sha |
|
68 |
from binascii import hexlify |
|
69 |
||
70 |
factor = 10 |
|
71 |
||
72 |
_RECORDSIZE = 48 |
|
73 |
||
74 |
_HEADER = "bzr revfile v1\n" |
|
75 |
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER))) |
|
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
76 |
_NO_BASE = 0xFFFFFFFFL |
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
77 |
|
78 |
class RevfileError(Exception): |
|
79 |
pass
|
|
80 |
||
81 |
class Revfile: |
|
82 |
def __init__(self, basename): |
|
83 |
self.basename = basename |
|
84 |
self.idxfile = open(basename + '.irev', 'r+b') |
|
85 |
self.datafile = open(basename + '.drev', 'r+b') |
|
86 |
if self.last_idx() == -1: |
|
87 |
print 'init empty file' |
|
88 |
self.idxfile.write(_HEADER) |
|
89 |
self.idxfile.flush() |
|
90 |
else: |
|
91 |
h = self.idxfile.read(_RECORDSIZE) |
|
92 |
if h != _HEADER: |
|
93 |
raise RevfileError("bad header %r in index of %r" |
|
94 |
% (h, self.basename)) |
|
95 |
||
96 |
def last_idx(self): |
|
97 |
"""Return last index already present, or -1 if none.""" |
|
98 |
l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE] |
|
99 |
if l == 0: |
|
100 |
return -1 |
|
101 |
if l % _RECORDSIZE: |
|
102 |
raise RevfileError("bad length %d on index of %r" % (l, self.basename)) |
|
103 |
return (l / _RECORDSIZE) - 1 |
|
104 |
||
105 |
||
106 |
def revision(self, rev): |
|
107 |
base = self.index[rev][0] |
|
108 |
start = self.index[base][1] |
|
109 |
end = self.index[rev][1] + self.index[rev][2] |
|
110 |
f = open(self.datafile()) |
|
111 |
||
112 |
f.seek(start) |
|
113 |
data = f.read(end - start) |
|
114 |
||
115 |
last = self.index[base][2] |
|
116 |
text = zlib.decompress(data[:last]) |
|
117 |
||
118 |
for r in range(base + 1, rev + 1): |
|
119 |
s = self.index[r][2] |
|
120 |
b = zlib.decompress(data[last:last + s]) |
|
121 |
text = mdiff.bpatch(text, b) |
|
122 |
last = last + s |
|
123 |
||
124 |
return text |
|
125 |
||
126 |
||
127 |
def add_full_text(self, t): |
|
128 |
"""Add a full text to the file. |
|
129 |
||
130 |
This is not compressed against any reference version.
|
|
131 |
||
132 |
Returns the index for that text."""
|
|
133 |
idx = self.last_idx() + 1 |
|
134 |
self.datafile.seek(0, 2) # to end |
|
135 |
self.idxfile.seek(0, 2) |
|
136 |
assert self.idxfile.tell() == _RECORDSIZE * idx |
|
137 |
data_offset = self.datafile.tell() |
|
138 |
||
139 |
assert isinstance(t, str) # not unicode or anything wierd |
|
140 |
||
141 |
self.datafile.write(t) |
|
142 |
self.datafile.flush() |
|
143 |
||
144 |
entry = sha.new(t).digest() |
|
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
145 |
entry += struct.pack(">IIII12x", 0xFFFFFFFFL, 0, data_offset, len(t)) |
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
146 |
assert len(entry) == _RECORDSIZE |
147 |
||
148 |
self.idxfile.write(entry) |
|
149 |
self.idxfile.flush() |
|
150 |
||
151 |
return idx |
|
152 |
||
153 |
||
154 |
def __len__(self): |
|
155 |
return int(self.last_idx()) |
|
156 |
||
157 |
def __getitem__(self, idx): |
|
158 |
self.idxfile.seek((idx + 1) * _RECORDSIZE) |
|
159 |
rec = self.idxfile.read(_RECORDSIZE) |
|
160 |
if len(rec) != _RECORDSIZE: |
|
161 |
raise RevfileError("short read of %d bytes getting index %d from %r" |
|
162 |
% (len(rec), idx, self.basename)) |
|
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
163 |
return struct.unpack(">20sIIII12x", rec) |
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
164 |
|
165 |
||
166 |
||
167 |
def addrevision(self, text, changeset): |
|
168 |
t = self.tip() |
|
169 |
n = t + 1 |
|
170 |
||
171 |
if not n % factor: |
|
172 |
data = zlib.compress(text) |
|
173 |
base = n |
|
174 |
else: |
|
175 |
prev = self.revision(t) |
|
176 |
data = zlib.compress(mdiff.bdiff(prev, text)) |
|
177 |
base = self.index[t][0] |
|
178 |
||
179 |
offset = 0 |
|
180 |
if t >= 0: |
|
181 |
offset = self.index[t][1] + self.index[t][2] |
|
182 |
||
183 |
self.index.append((base, offset, len(data), changeset)) |
|
184 |
entry = struct.pack(">llll", base, offset, len(data), changeset) |
|
185 |
||
186 |
open(self.indexfile(), "a").write(entry) |
|
187 |
open(self.datafile(), "a").write(data) |
|
188 |
||
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
189 |
def dump(self, f=sys.stdout): |
190 |
f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n' |
|
191 |
% tuple('idx sha1 base flags offset len'.split())) |
|
192 |
f.write('-------- ---------------------------------------- ') |
|
193 |
f.write('-------- -------- -------- --------\n') |
|
194 |
||
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
195 |
for i in range(len(self)): |
196 |
rec = self[i] |
|
|
199
by mbp at sourcefrog
- use -1 for no_base in revfile |
197 |
f.write("#%-7d %40s " % (i, hexlify(rec[0]))) |
198 |
if rec[1] == _NO_BASE: |
|
199 |
f.write("(none) ") |
|
200 |
else: |
|
201 |
f.write("#%-7d " % rec[1]) |
|
202 |
||
203 |
f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4])) |
|
|
198
by mbp at sourcefrog
- experimental compressed Revfile support |
204 |
|
205 |
||
206 |
||
207 |
def main(argv): |
|
208 |
r = Revfile("testrev") |
|
209 |
if len(argv) < 2: |
|
210 |
sys.stderr.write("usage: revfile dump\n" |
|
211 |
" revfile add\n") |
|
212 |
sys.exit(1) |
|
213 |
||
214 |
if argv[1] == 'add': |
|
215 |
new_idx = r.add_full_text(sys.stdin.read()) |
|
216 |
print 'added idx %d' % new_idx |
|
217 |
elif argv[1] == 'dump': |
|
218 |
r.dump() |
|
219 |
else: |
|
220 |
sys.stderr.write("unknown command %r\n" % argv[1]) |
|
221 |
sys.exit(1) |
|
222 |
||
223 |
||
224 |
if __name__ == '__main__': |
|
225 |
import sys |
|
226 |
main(sys.argv) |