/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
846 by Martin Pool
- start adding refactored/simplified hash cache
1
# (C) 2005 Canonical Ltd
2
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
18
864 by Martin Pool
doc
19
# TODO: Perhaps have a way to stat all the files in inode order, and
20
# then remember that they're all fresh for the lifetime of the object?
21
22
# TODO: Keep track of whether there are in-memory updates that need to
23
# be flushed.
24
25
# TODO: Perhaps return more details on the file to avoid statting it
26
# again: nonexistent, file type, size, etc
27
28
29
846 by Martin Pool
- start adding refactored/simplified hash cache
30
862 by Martin Pool
- code to re-read hashcache from file
31
# Version marker written as the first line of the cache file; read()
# discards the whole cache if the file's first line does not match it.
CACHE_HEADER = "### bzr statcache v5\n"
859 by Martin Pool
- add HashCache.write and a simple test for it
32
33
846 by Martin Pool
- start adding refactored/simplified hash cache
34
def _fingerprint(abspath):
35
    import os, stat
36
37
    try:
38
        fs = os.lstat(abspath)
39
    except OSError:
40
        # might be missing, etc
41
        return None
42
43
    if stat.S_ISDIR(fs.st_mode):
44
        return None
45
46
    return (fs.st_size, fs.st_mtime,
47
            fs.st_ctime, fs.st_ino, fs.st_dev)
48
49
50
class HashCache(object):
    """Cache for looking up file SHA-1.

    Files are considered to match the cached value if the fingerprint
    of the file has not changed.  This includes its mtime, ctime,
    device number, inode number, and size.  This should catch
    modifications or replacement of the file by a new one.

    This may not catch modifications that do not change the file's
    size and that occur within the resolution window of the
    timestamps.  To handle this we specifically do not cache files
    which have changed since the start of the present second, since
    they could undetectably change again.

    This scheme may fail if the machine's clock steps backwards.
    Don't do that.

    This does not canonicalize the paths passed in; that should be
    done by the caller.

    _cache
        Indexed by path, points to a two-tuple of the SHA-1 of the file
        and its fingerprint.

    stat_count
        number of times files have been statted

    hit_count
        number of times files have been retrieved from the cache, avoiding a
        re-read

    miss_count
        number of misses (times files have been completely re-read)

    danger_count
        number of files whose fingerprint was too fresh to cache safely
    """
    def __init__(self, basedir):
        self.basedir = basedir
        self.hit_count = 0
        self.miss_count = 0
        self.stat_count = 0
        self.danger_count = 0
        self._cache = {}


    def clear(self):
        """Discard all cached information.

        This does not reset the counters."""
        # BUG FIX: this previously assigned to self._cache_sha1, a leftover
        # name from before the one-dictionary refactoring, so the real
        # cache was never actually cleared.
        self._cache = {}


    def get_sha1(self, path):
        """Return the hex SHA-1 of the contents of the file at path.

        path is relative to self.basedir.

        Returns None if the file does not exist or is not a regular file
        (e.g. it is a directory) — i.e. whenever _fingerprint() gives None.
        """
        import os, time
        from bzrlib.osutils import sha_file

        abspath = os.path.join(self.basedir, path)
        fp = _fingerprint(abspath)
        c = self._cache.get(path)
        if c:
            cache_sha1, cache_fp = c
        else:
            cache_sha1, cache_fp = None, None

        self.stat_count += 1

        if not fp:
            # not a regular file (missing, directory, ...)
            return None
        elif cache_fp and (cache_fp == fp):
            # fingerprint unchanged: trust the cached digest
            self.hit_count += 1
            return cache_sha1
        else:
            self.miss_count += 1
            # BUG FIX: close the file handle instead of leaking it
            f = open(abspath, 'rb')
            try:
                digest = sha_file(f)
            finally:
                f.close()

            now = int(time.time())
            if fp[1] >= now or fp[2] >= now:
                # changed too recently; can't be cached.  we can
                # return the result and it could possibly be cached
                # next time.
                self.danger_count += 1
                if cache_fp:
                    del self._cache[path]
            else:
                self._cache[path] = (digest, fp)

            return digest


    def write(self, cachefn):
        """Write contents of cache to file."""
        from atomicfile import AtomicFile

        outf = AtomicFile(cachefn, 'wb')
        try:
            outf.write(CACHE_HEADER)

            for path, c in self._cache.items():
                assert '//' not in path, path
                outf.write(path.encode('utf-8'))
                outf.write('// ')
                # hex sha1 followed by the five fingerprint fields,
                # space-separated — exactly the layout read() parses.
                fields = [c[0]] + ['%d' % fld for fld in c[1]]
                outf.write(' '.join(fields))
                outf.write('\n')

            outf.commit()
        finally:
            if not outf.closed:
                outf.abort()


    def read(self, cachefn):
        """Reinstate cache from file.

        Overwrites existing cache.

        If the cache file has the wrong version marker, this just clears
        the cache."""
        from bzrlib.trace import mutter, warning

        inf = open(cachefn, 'rb')
        # BUG FIX: make sure the cache file is closed even on early return
        try:
            self._cache = {}

            hdr = inf.readline()
            if hdr != CACHE_HEADER:
                mutter('cache header marker not found at top of %s; discarding cache'
                       % cachefn)
                return

            for l in inf:
                pos = l.index('// ')
                path = l[:pos].decode('utf-8')
                if path in self._cache:
                    warning('duplicated path %r in cache' % path)
                    continue

                pos += 3
                fields = l[pos:].split(' ')
                if len(fields) != 6:
                    warning("bad line in hashcache: %r" % l)
                    continue

                sha1 = fields[0]
                if len(sha1) != 40:
                    warning("bad sha1 in hashcache: %r" % sha1)
                    continue

                # fingerprint: (size, mtime, ctime, ino, dev); plain int is
                # equivalent to the old long() here (values compare equal)
                fp = tuple(map(int, fields[1:]))

                self._cache[path] = (sha1, fp)
        finally:
            inf.close()
208
209
210