/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
198 by mbp at sourcefrog
- experimental compressed Revfile support
1
#! /usr/bin/env python
2
3
# (C) 2005 Matt Mackall
4
5
# modified to squish into bzr by Martin Pool
6
7
# This program is free software; you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation; either version 2 of the License, or
10
# (at your option) any later version.
11
12
# This program is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
# GNU General Public License for more details.
16
17
# You should have received a copy of the GNU General Public License
18
# along with this program; if not, write to the Free Software
19
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20
21
22
"""Packed file revision storage.
23
24
A Revfile holds the text history of a particular source file, such
25
as Makefile.  It can represent a tree of text versions for that
26
file, allowing for microbranches within a single repository.
27
28
This is stored on disk as two files: an index file, and a data file.
29
The index file is short and always read completely into memory; the
30
data file is much longer and only the relevant bits of it,
31
identified by the index file, need to be read.
32
33
Each text version is identified by the SHA-1 of the full text of
34
that version.  It also has a sequence number within the file.
35
36
The index file has a short header and then a sequence of fixed-length
37
records:
38
39
* byte[20]    SHA-1 of text (as binary, not hex)
40
* uint32      sequence number this is based on, or -1 for full text
41
* uint32      flags: 1=zlib compressed
42
* uint32      offset in text file of start
43
* uint32      length of compressed delta in text file
44
* uint32[3]   reserved
45
46
total 48 bytes.
47
199 by mbp at sourcefrog
- use -1 for no_base in revfile
48
The header is also 48 bytes for tidyness and easy calculation.
198 by mbp at sourcefrog
- experimental compressed Revfile support
49
50
Both the index and the text are only ever appended to; a consequence
51
is that sequence numbers are stable references.  But not every
52
repository in the world will assign the same sequence numbers,
53
therefore the SHA-1 is the only universally unique reference.
54
55
This is meant to scale to hold 100,000 revisions of a single file, by
56
which time the index file will be ~4.8MB and a bit big to read
57
sequentially.
58
59
Some of the reserved fields could be used to implement a (semi?)
60
balanced tree indexed by SHA1 so we can much more efficiently find the
61
index associated with a particular hash.  For 100,000 revs we would be
62
able to find it in about 17 random reads, which is not too bad.
63
"""
64
 
65
66
67
import sys, zlib, struct, mdiff, stat, os, sha
68
from binascii import hexlify
69
70
factor = 10
71
72
_RECORDSIZE = 48
73
74
_HEADER = "bzr revfile v1\n"
75
_HEADER = _HEADER + ('\xff' * (_RECORDSIZE - len(_HEADER)))
199 by mbp at sourcefrog
- use -1 for no_base in revfile
76
_NO_BASE = 0xFFFFFFFFL
198 by mbp at sourcefrog
- experimental compressed Revfile support
77
78
class RevfileError(Exception):
79
    pass
80
81
class Revfile:
82
    def __init__(self, basename):
83
        self.basename = basename
84
        self.idxfile = open(basename + '.irev', 'r+b')
85
        self.datafile = open(basename + '.drev', 'r+b')
86
        if self.last_idx() == -1:
87
            print 'init empty file'
88
            self.idxfile.write(_HEADER)
89
            self.idxfile.flush()
90
        else:
91
            h = self.idxfile.read(_RECORDSIZE)
92
            if h != _HEADER:
93
                raise RevfileError("bad header %r in index of %r"
94
                                   % (h, self.basename))
95
        
96
    def last_idx(self):
97
        """Return last index already present, or -1 if none."""
98
        l = os.fstat(self.idxfile.fileno())[stat.ST_SIZE]
99
        if l == 0:
100
            return -1
101
        if l % _RECORDSIZE:
102
            raise RevfileError("bad length %d on index of %r" % (l, self.basename))
103
        return (l / _RECORDSIZE) - 1
104
105
106
    def revision(self, rev):
107
        base = self.index[rev][0]
108
        start = self.index[base][1]
109
        end = self.index[rev][1] + self.index[rev][2]
110
        f = open(self.datafile())
111
112
        f.seek(start)
113
        data = f.read(end - start)
114
115
        last = self.index[base][2]
116
        text = zlib.decompress(data[:last])
117
118
        for r in range(base + 1, rev + 1):
119
            s = self.index[r][2]
120
            b = zlib.decompress(data[last:last + s])
121
            text = mdiff.bpatch(text, b)
122
            last = last + s
123
124
        return text    
125
126
127
    def add_full_text(self, t):
128
        """Add a full text to the file.
129
130
        This is not compressed against any reference version.
131
132
        Returns the index for that text."""
133
        idx = self.last_idx() + 1
134
        self.datafile.seek(0, 2)        # to end
135
        self.idxfile.seek(0, 2)
136
        assert self.idxfile.tell() == _RECORDSIZE * idx
137
        data_offset = self.datafile.tell()
138
139
        assert isinstance(t, str) # not unicode or anything wierd
140
141
        self.datafile.write(t)
142
        self.datafile.flush()
143
144
        entry = sha.new(t).digest()
199 by mbp at sourcefrog
- use -1 for no_base in revfile
145
        entry += struct.pack(">IIII12x", 0xFFFFFFFFL, 0, data_offset, len(t))
198 by mbp at sourcefrog
- experimental compressed Revfile support
146
        assert len(entry) == _RECORDSIZE
147
148
        self.idxfile.write(entry)
149
        self.idxfile.flush()
150
151
        return idx
152
153
154
    def __len__(self):
155
        return int(self.last_idx())
156
157
    def __getitem__(self, idx):
158
        self.idxfile.seek((idx + 1) * _RECORDSIZE)
159
        rec = self.idxfile.read(_RECORDSIZE)
160
        if len(rec) != _RECORDSIZE:
161
            raise RevfileError("short read of %d bytes getting index %d from %r"
162
                               % (len(rec), idx, self.basename))
199 by mbp at sourcefrog
- use -1 for no_base in revfile
163
        return struct.unpack(">20sIIII12x", rec)
198 by mbp at sourcefrog
- experimental compressed Revfile support
164
165
        
166
        
167
    def addrevision(self, text, changeset):
168
        t = self.tip()
169
        n = t + 1
170
171
        if not n % factor:
172
            data = zlib.compress(text)
173
            base = n
174
        else:
175
            prev = self.revision(t)
176
            data = zlib.compress(mdiff.bdiff(prev, text))
177
            base = self.index[t][0]
178
179
        offset = 0
180
        if t >= 0:
181
            offset = self.index[t][1] + self.index[t][2]
182
183
        self.index.append((base, offset, len(data), changeset))
184
        entry = struct.pack(">llll", base, offset, len(data), changeset)
185
186
        open(self.indexfile(), "a").write(entry)
187
        open(self.datafile(), "a").write(data)
188
199 by mbp at sourcefrog
- use -1 for no_base in revfile
189
    def dump(self, f=sys.stdout):
190
        f.write('%-8s %-40s %-8s %-8s %-8s %-8s\n' 
191
                % tuple('idx sha1 base flags offset len'.split()))
192
        f.write('-------- ---------------------------------------- ')
193
        f.write('-------- -------- -------- --------\n')
194
198 by mbp at sourcefrog
- experimental compressed Revfile support
195
        for i in range(len(self)):
196
            rec = self[i]
199 by mbp at sourcefrog
- use -1 for no_base in revfile
197
            f.write("#%-7d %40s " % (i, hexlify(rec[0])))
198
            if rec[1] == _NO_BASE:
199
                f.write("(none)   ")
200
            else:
201
                f.write("#%-7d " % rec[1])
202
                
203
            f.write("%8x %8d %8d\n" % (rec[2], rec[3], rec[4]))
198 by mbp at sourcefrog
- experimental compressed Revfile support
204
        
205
206
207
def main(argv):
208
    r = Revfile("testrev")
209
    if len(argv) < 2:
210
        sys.stderr.write("usage: revfile dump\n"
211
                         "       revfile add\n")
212
        sys.exit(1)
213
        
214
    if argv[1] == 'add':
215
        new_idx = r.add_full_text(sys.stdin.read())
216
        print 'added idx %d' % new_idx
217
    elif argv[1] == 'dump':
218
        r.dump()
219
    else:
220
        sys.stderr.write("unknown command %r\n" % argv[1])
221
        sys.exit(1)
222
    
223
224
if __name__ == '__main__':
225
    import sys
226
    main(sys.argv)