bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
| 
362
by Martin Pool
 - Import stat-cache code  | 
1  | 
# (C) 2005 Canonical Ltd
 | 
2  | 
||
3  | 
# This program is free software; you can redistribute it and/or modify
 | 
|
4  | 
# it under the terms of the GNU General Public License as published by
 | 
|
5  | 
# the Free Software Foundation; either version 2 of the License, or
 | 
|
6  | 
# (at your option) any later version.
 | 
|
7  | 
||
8  | 
# This program is distributed in the hope that it will be useful,
 | 
|
9  | 
# but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
|
10  | 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
|
11  | 
# GNU General Public License for more details.
 | 
|
12  | 
||
13  | 
# You should have received a copy of the GNU General Public License
 | 
|
14  | 
# along with this program; if not, write to the Free Software
 | 
|
15  | 
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | 
|
16  | 
||
17  | 
import stat, os, sha, time  | 
|
18  | 
||
19  | 
from trace import mutter  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
20  | 
from errors import BzrError, BzrCheckError  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
21  | 
|
22  | 
||
| 
427
by Martin Pool
 - statcache docs  | 
23  | 
"""File stat cache to speed up tree comparisons.
 | 
24  | 
||
25  | 
This module basically gives a quick way to find the SHA-1 and related
 | 
|
26  | 
information of a file in the working directory, without actually
 | 
|
27  | 
reading and hashing the whole file.
 | 
|
28  | 
||
| 
509
by Martin Pool
 clean up stat cache code:  | 
29  | 
|
30  | 
||
31  | 
Implementation
 | 
|
32  | 
==============
 | 
|
33  | 
||
34  | 
Users of this module should not need to know about how this is
 | 
|
35  | 
implemented, and in particular should not depend on the particular
 | 
|
36  | 
data which is stored or its format.
 | 
|
37  | 
||
| 
427
by Martin Pool
 - statcache docs  | 
38  | 
This is done by maintaining a cache indexed by a file fingerprint of
 | 
39  | 
(path, size, mtime, ctime, ino, dev) pointing to the SHA-1.  If the
 | 
|
40  | 
fingerprint has changed, we assume the file content has not changed
 | 
|
41  | 
either and the SHA-1 is therefore the same.
 | 
|
42  | 
||
43  | 
If any of the fingerprint fields have changed then the file content
 | 
|
44  | 
*may* have changed, or it may not have.  We need to reread the file
 | 
|
45  | 
contents to make sure, but this is not visible to the user or
 | 
|
46  | 
higher-level code (except as a delay of course).
 | 
|
47  | 
||
48  | 
The mtime and ctime are stored with nanosecond fields, but not all
 | 
|
49  | 
filesystems give this level of precision.  There is therefore a
 | 
|
50  | 
possible race: the file might be modified twice within a second
 | 
|
51  | 
without changing the size or mtime, and a SHA-1 cached from the first
 | 
|
52  | 
version would be wrong.  We handle this by not recording a cached hash
 | 
|
53  | 
for any files which were modified in the current second and that
 | 
|
54  | 
therefore have the chance to change again before the second is up.
 | 
|
55  | 
||
56  | 
The only known hole in this design is if the system clock jumps
 | 
|
57  | 
backwards crossing invocations of bzr.  Please don't do that; use ntp
 | 
|
58  | 
to gradually adjust your clock or don't use bzr over the step.
 | 
|
59  | 
||
60  | 
At the moment this is stored in a simple textfile; it might be nice
 | 
|
61  | 
to use a tdb instead.
 | 
|
| 
434
by Martin Pool
 doc  | 
62  | 
|
63  | 
The cache is represented as a map from file_id to a tuple of (file_id,
 | 
|
64  | 
sha1, path, size, mtime, ctime, ino, dev).
 | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
65  | 
|
66  | 
The SHA-1 is stored in memory as a hexdigest.
 | 
|
67  | 
||
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
68  | 
File names and file-ids are written out with non-ascii or whitespace
 | 
69  | 
characters given as python-style unicode escapes.  (file-ids shouldn't
 | 
|
70  | 
contain wierd characters, but it might happen.)
 | 
|
| 
427
by Martin Pool
 - statcache docs  | 
71  | 
"""
 | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
72  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
73  | 
# order of fields returned by fingerprint()
 | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
74  | 
FP_SIZE = 0  | 
75  | 
FP_MTIME = 1  | 
|
76  | 
FP_CTIME = 2  | 
|
77  | 
FP_INO = 3  | 
|
78  | 
FP_DEV = 4  | 
|
79  | 
||
| 
509
by Martin Pool
 clean up stat cache code:  | 
80  | 
# order of fields in the statcache file and in the in-memory map
 | 
| 
437
by Martin Pool
 - new command 'bzr modified' to exercise the statcache  | 
81  | 
SC_FILE_ID = 0  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
82  | 
SC_SHA1 = 1  | 
83  | 
SC_PATH = 2  | 
|
84  | 
SC_SIZE = 3  | 
|
85  | 
SC_MTIME = 4  | 
|
86  | 
SC_CTIME = 5  | 
|
87  | 
SC_INO = 6  | 
|
88  | 
SC_DEV = 7  | 
|
| 
437
by Martin Pool
 - new command 'bzr modified' to exercise the statcache  | 
89  | 
|
90  | 
||
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
91  | 
|
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
92  | 
CACHE_HEADER = "### bzr statcache v3"  | 
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
93  | 
|
94  | 
||
| 
458
by Martin Pool
 - fix statcache update from subdirectories  | 
95  | 
def fingerprint(abspath):  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
96  | 
try:  | 
97  | 
fs = os.lstat(abspath)  | 
|
98  | 
except OSError:  | 
|
99  | 
        # might be missing, etc
 | 
|
100  | 
return None  | 
|
101  | 
||
102  | 
if stat.S_ISDIR(fs.st_mode):  | 
|
103  | 
return None  | 
|
104  | 
||
105  | 
return (fs.st_size, fs.st_mtime,  | 
|
106  | 
fs.st_ctime, fs.st_ino, fs.st_dev)  | 
|
107  | 
||
108  | 
||
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
109  | 
|
110  | 
def safe_quote(s):  | 
|
111  | 
return s.encode('unicode_escape') \  | 
|
112  | 
.replace('\n', '\\u000a') \  | 
|
113  | 
.replace(' ', '\\u0020') \  | 
|
114  | 
.replace('\r', '\\u000d')  | 
|
115  | 
||
116  | 
||
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
117  | 
def _write_cache(basedir, entries):  | 
| 
428
by Martin Pool
 - Use AtomicFile to update statcache.  | 
118  | 
from atomicfile import AtomicFile  | 
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
119  | 
|
120  | 
cachefn = os.path.join(basedir, '.bzr', 'stat-cache')  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
121  | 
outf = AtomicFile(cachefn, 'wb')  | 
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
122  | 
outf.write(CACHE_HEADER + '\n')  | 
| 
428
by Martin Pool
 - Use AtomicFile to update statcache.  | 
123  | 
try:  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
124  | 
for entry in entries:  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
125  | 
if len(entry) != 8:  | 
126  | 
raise ValueError("invalid statcache entry tuple %r" % entry)  | 
|
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
127  | 
outf.write(safe_quote(entry[0])) # file id  | 
128  | 
outf.write(' ')  | 
|
129  | 
outf.write(entry[1]) # hex sha1  | 
|
130  | 
outf.write(' ')  | 
|
131  | 
outf.write(safe_quote(entry[2])) # name  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
132  | 
for nf in entry[3:]:  | 
133  | 
outf.write(' %d' % nf)  | 
|
134  | 
outf.write('\n')  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
135  | 
|
| 
428
by Martin Pool
 - Use AtomicFile to update statcache.  | 
136  | 
outf.commit()  | 
137  | 
finally:  | 
|
138  | 
if not outf.closed:  | 
|
139  | 
outf.abort()  | 
|
| 
551
by Martin Pool
 - Don't fail if unable to update the statcache  | 
140  | 
|
141  | 
||
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
142  | 
def _try_write_cache(basedir, entries):  | 
| 
551
by Martin Pool
 - Don't fail if unable to update the statcache  | 
143  | 
try:  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
144  | 
return _write_cache(basedir, entries)  | 
| 
551
by Martin Pool
 - Don't fail if unable to update the statcache  | 
145  | 
except IOError, e:  | 
146  | 
mutter("cannot update statcache in %s: %s" % (basedir, e))  | 
|
147  | 
except OSError, e:  | 
|
148  | 
mutter("cannot update statcache in %s: %s" % (basedir, e))  | 
|
149  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
150  | 
|
151  | 
||
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
152  | 
def load_cache(basedir):  | 
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
153  | 
import re  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
154  | 
cache = {}  | 
| 
540
by Martin Pool
 - use builtin set object in python2.4  | 
155  | 
seen_paths = {}  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
156  | 
|
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
157  | 
sha_re = re.compile(r'[a-f0-9]{40}')  | 
158  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
159  | 
try:  | 
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
160  | 
cachefn = os.path.join(basedir, '.bzr', 'stat-cache')  | 
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
161  | 
cachefile = open(cachefn, 'rb')  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
162  | 
except IOError:  | 
163  | 
return cache  | 
|
| 
524
by Martin Pool
 - add version marker at top of statcache  | 
164  | 
|
165  | 
line1 = cachefile.readline().rstrip('\r\n')  | 
|
166  | 
if line1 != CACHE_HEADER:  | 
|
167  | 
mutter('cache header marker not found at top of %s' % cachefn)  | 
|
168  | 
return cache  | 
|
169  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
170  | 
for l in cachefile:  | 
171  | 
f = l.split(' ')  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
172  | 
|
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
173  | 
file_id = f[0].decode('unicode_escape')  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
174  | 
if file_id in cache:  | 
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
175  | 
raise BzrCheckError("duplicated file_id in cache: {%s}" % file_id)  | 
176  | 
||
177  | 
text_sha = f[1]  | 
|
178  | 
if len(text_sha) != 40 or not sha_re.match(text_sha):  | 
|
179  | 
raise BzrCheckError("invalid file SHA-1 in cache: %r" % text_sha)  | 
|
| 
509
by Martin Pool
 clean up stat cache code:  | 
180  | 
|
| 
548
by Martin Pool
 - Write statcache using \u style encoding to avoid  | 
181  | 
path = f[2].decode('unicode_escape')  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
182  | 
if path in seen_paths:  | 
183  | 
raise BzrCheckError("duplicated path in cache: %r" % path)  | 
|
| 
540
by Martin Pool
 - use builtin set object in python2.4  | 
184  | 
seen_paths[path] = True  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
185  | 
|
| 
537
by Martin Pool
 - file-ids are stored as quoted-printable in the stat cache,  | 
186  | 
entry = (file_id, text_sha, path) + tuple([long(x) for x in f[3:]])  | 
| 
509
by Martin Pool
 clean up stat cache code:  | 
187  | 
if len(entry) != 8:  | 
188  | 
raise ValueError("invalid statcache entry tuple %r" % entry)  | 
|
189  | 
||
190  | 
cache[file_id] = entry  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
191  | 
return cache  | 
192  | 
||
193  | 
||
194  | 
||
195  | 
||
196  | 
def _files_from_inventory(inv):  | 
|
197  | 
for path, ie in inv.iter_entries():  | 
|
198  | 
if ie.kind != 'file':  | 
|
199  | 
            continue
 | 
|
200  | 
yield ie.file_id, path  | 
|
201  | 
||
202  | 
||
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
203  | 
|
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
204  | 
def update_cache(basedir, inv, flush=False):  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
205  | 
"""Update and return the cache for the branch.  | 
206  | 
||
207  | 
    The returned cache may contain entries that have not been written
 | 
|
208  | 
    to disk for files recently touched.
 | 
|
209  | 
||
210  | 
    flush -- discard any previous cache and recalculate from scratch.
 | 
|
211  | 
    """
 | 
|
212  | 
||
| 
554
by Martin Pool
 - clean up statcache code  | 
213  | 
    # load the existing cache; use information there to find a list of
 | 
214  | 
    # files ordered by inode, which is alleged to be the fastest order
 | 
|
215  | 
    # to stat the files.
 | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
216  | 
|
| 
554
by Martin Pool
 - clean up statcache code  | 
217  | 
to_update = _files_from_inventory(inv)  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
218  | 
|
| 
438
by Martin Pool
 - Avoid calling Inventory.iter_entries() when finding modified  | 
219  | 
assert isinstance(flush, bool)  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
220  | 
if flush:  | 
221  | 
cache = {}  | 
|
222  | 
else:  | 
|
| 
453
by Martin Pool
 - Split WorkingTree into its own file  | 
223  | 
cache = load_cache(basedir)  | 
| 
554
by Martin Pool
 - clean up statcache code  | 
224  | 
|
225  | 
by_inode = []  | 
|
226  | 
without_inode = []  | 
|
227  | 
for file_id, path in to_update:  | 
|
228  | 
if file_id in cache:  | 
|
229  | 
by_inode.append((cache[file_id][SC_INO], file_id, path))  | 
|
230  | 
else:  | 
|
231  | 
without_inode.append((file_id, path))  | 
|
232  | 
by_inode.sort()  | 
|
233  | 
||
234  | 
to_update = [a[1:] for a in by_inode] + without_inode  | 
|
235  | 
||
236  | 
stat_cnt = missing_cnt = new_cnt = hardcheck = change_cnt = 0  | 
|
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
237  | 
|
| 
549
by Martin Pool
 doc  | 
238  | 
    # dangerfiles have been recently touched and can't be committed to
 | 
239  | 
    # a persistent cache yet, but they are returned to the caller.
 | 
|
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
240  | 
dangerfiles = []  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
241  | 
|
242  | 
now = int(time.time())  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
243  | 
|
| 
458
by Martin Pool
 - fix statcache update from subdirectories  | 
244  | 
    ## mutter('update statcache under %r' % basedir)
 | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
245  | 
for file_id, path in to_update:  | 
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
246  | 
abspath = os.path.join(basedir, path)  | 
| 
458
by Martin Pool
 - fix statcache update from subdirectories  | 
247  | 
fp = fingerprint(abspath)  | 
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
248  | 
stat_cnt += 1  | 
249  | 
||
| 
362
by Martin Pool
 - Import stat-cache code  | 
250  | 
cacheentry = cache.get(file_id)  | 
251  | 
||
252  | 
if fp == None: # not here  | 
|
253  | 
if cacheentry:  | 
|
254  | 
del cache[file_id]  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
255  | 
change_cnt += 1  | 
256  | 
missing_cnt += 1  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
257  | 
            continue
 | 
| 
554
by Martin Pool
 - clean up statcache code  | 
258  | 
elif not cacheentry:  | 
259  | 
new_cnt += 1  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
260  | 
|
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
261  | 
if (fp[FP_MTIME] >= now) or (fp[FP_CTIME] >= now):  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
262  | 
dangerfiles.append(file_id)  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
263  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
264  | 
if cacheentry and (cacheentry[3:] == fp):  | 
265  | 
continue # all stat fields unchanged  | 
|
266  | 
||
267  | 
hardcheck += 1  | 
|
268  | 
||
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
269  | 
dig = sha.new(file(abspath, 'rb').read()).hexdigest()  | 
| 
362
by Martin Pool
 - Import stat-cache code  | 
270  | 
|
| 
486
by Martin Pool
 - Write out statcache if any files are updated, even  | 
271  | 
        # We update the cache even if the digest has not changed from
 | 
272  | 
        # last time we looked, so that the fingerprint fields will
 | 
|
273  | 
        # match in future.
 | 
|
274  | 
cacheentry = (file_id, dig, path) + fp  | 
|
275  | 
cache[file_id] = cacheentry  | 
|
276  | 
change_cnt += 1  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
277  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
278  | 
mutter('statcache: statted %d files, read %d files, %d changed, %d dangerous, '  | 
| 
554
by Martin Pool
 - clean up statcache code  | 
279  | 
'%d deleted, %d new, '  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
280  | 
'%d in cache'  | 
| 
554
by Martin Pool
 - clean up statcache code  | 
281  | 
% (stat_cnt, hardcheck, change_cnt, len(dangerfiles),  | 
282  | 
missing_cnt, new_cnt, len(cache)))  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
283  | 
|
| 
457
by Martin Pool
 - more trace and profiling in statcache  | 
284  | 
if change_cnt:  | 
| 
436
by Martin Pool
 - Avoid dangerous files when writing out stat cache  | 
285  | 
mutter('updating on-disk statcache')  | 
| 
553
by Martin Pool
 - refactor handling of dangerous files (changed recently)  | 
286  | 
|
287  | 
if dangerfiles:  | 
|
288  | 
safe_cache = cache.copy()  | 
|
289  | 
for file_id in dangerfiles:  | 
|
290  | 
del safe_cache[file_id]  | 
|
291  | 
else:  | 
|
292  | 
safe_cache = cache  | 
|
293  | 
||
294  | 
_try_write_cache(basedir, safe_cache.itervalues())  | 
|
| 
362
by Martin Pool
 - Import stat-cache code  | 
295  | 
|
296  | 
return cache  |