/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
846 by Martin Pool
- start adding refactored/simplified hash cache
1
# (C) 2005 Canonical Ltd
2
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
18
864 by Martin Pool
doc
19
# TODO: Perhaps have a way to stat all the files in inode order, and
20
# then remember that they're all fresh for the lifetime of the object?
21
22
# TODO: Keep track of whether there are in-memory updates that need to
23
# be flushed.
24
25
# TODO: Perhaps return more details on the file to avoid statting it
26
# again: nonexistent, file type, size, etc
27
28
29
846 by Martin Pool
- start adding refactored/simplified hash cache
30
862 by Martin Pool
- code to re-read hashcache from file
31
# Version marker written as the first line of the cache file; read()
# discards the whole cache if the file's first line does not match it.
CACHE_HEADER = "### bzr statcache v5\n"
859 by Martin Pool
- add HashCache.write and a simple test for it
32
33
846 by Martin Pool
- start adding refactored/simplified hash cache
34
def _fingerprint(abspath):
35
    import os, stat
36
37
    try:
38
        fs = os.lstat(abspath)
39
    except OSError:
40
        # might be missing, etc
41
        return None
42
43
    if stat.S_ISDIR(fs.st_mode):
44
        return None
45
46
    return (fs.st_size, fs.st_mtime,
47
            fs.st_ctime, fs.st_ino, fs.st_dev)
48
49
50
class HashCache(object):
    """Cache for looking up file SHA-1.

    Files are considered to match the cached value if the fingerprint
    of the file has not changed.  This includes its mtime, ctime,
    device number, inode number, and size.  This should catch
    modifications or replacement of the file by a new one.

    This may not catch modifications that do not change the file's
    size and that occur within the resolution window of the
    timestamps.  To handle this we specifically do not cache files
    which have changed since the start of the present second, since
    they could undetectably change again.

    This scheme may fail if the machine's clock steps backwards.
    Don't do that.

    This does not canonicalize the paths passed in; that should be
    done by the caller.

    _cache
        Indexed by path, points to a two-tuple of the SHA-1 of the file
        and its fingerprint.

    stat_count
        number of times files have been statted

    hit_count
        number of times files have been retrieved from the cache, avoiding a
        re-read

    miss_count
        number of misses (times files have been completely re-read)

    danger_count
        number of files whose fingerprint was too fresh to cache safely
    """
    def __init__(self, basedir):
        self.basedir = basedir
        self.hit_count = 0
        self.miss_count = 0
        self.stat_count = 0
        self.danger_count = 0
        self._cache = {}


    def clear(self):
        """Discard all cached information.

        This does not reset the counters."""
        # BUG FIX: this previously assigned to self._cache_sha1, a leftover
        # name from before the one-dictionary refactoring, so the real
        # cache was never actually cleared.
        self._cache = {}


    def get_sha1(self, path):
        """Return the hex SHA-1 of the contents of the file at path.

        path is relative to self.basedir.

        Returns None if the file does not exist or is not a regular file
        (e.g. it is a directory) — i.e. whenever _fingerprint() gives None.
        """
        import os, time
        from bzrlib.osutils import sha_file

        abspath = os.path.join(self.basedir, path)
        fp = _fingerprint(abspath)
        c = self._cache.get(path)
        if c:
            cache_sha1, cache_fp = c
        else:
            cache_sha1, cache_fp = None, None

        self.stat_count += 1

        if not fp:
            # not a regular file (missing, directory, ...)
            return None
        elif cache_fp and (cache_fp == fp):
            # fingerprint unchanged: trust the cached digest
            self.hit_count += 1
            return cache_sha1
        else:
            self.miss_count += 1
            # BUG FIX: close the file handle instead of leaking it
            f = open(abspath, 'rb')
            try:
                digest = sha_file(f)
            finally:
                f.close()

            now = int(time.time())
            if fp[1] >= now or fp[2] >= now:
                # changed too recently; can't be cached.  we can
                # return the result and it could possibly be cached
                # next time.
                self.danger_count += 1
                if cache_fp:
                    del self._cache[path]
            else:
                self._cache[path] = (digest, fp)

            return digest


    def write(self, cachefn):
        """Write contents of cache to file."""
        from atomicfile import AtomicFile

        outf = AtomicFile(cachefn, 'wb')
        try:
            outf.write(CACHE_HEADER)

            for path, c in self._cache.items():
                assert '//' not in path, path
                outf.write(path.encode('utf-8'))
                outf.write('// ')
                # hex sha1 followed by the five fingerprint fields,
                # space-separated — exactly the layout read() parses.
                fields = [c[0]] + ['%d' % fld for fld in c[1]]
                outf.write(' '.join(fields))
                outf.write('\n')

            outf.commit()
        finally:
            if not outf.closed:
                outf.abort()


    def read(self, cachefn):
        """Reinstate cache from file.

        Overwrites existing cache.

        If the cache file has the wrong version marker, this just clears
        the cache."""
        from bzrlib.trace import mutter, warning

        inf = open(cachefn, 'rb')
        # BUG FIX: make sure the cache file is closed even on early return
        try:
            self._cache = {}

            hdr = inf.readline()
            if hdr != CACHE_HEADER:
                mutter('cache header marker not found at top of %s; discarding cache'
                       % cachefn)
                return

            for l in inf:
                pos = l.index('// ')
                path = l[:pos].decode('utf-8')
                if path in self._cache:
                    warning('duplicated path %r in cache' % path)
                    continue

                pos += 3
                fields = l[pos:].split(' ')
                if len(fields) != 6:
                    warning("bad line in hashcache: %r" % l)
                    continue

                sha1 = fields[0]
                if len(sha1) != 40:
                    warning("bad sha1 in hashcache: %r" % sha1)
                    continue

                # fingerprint: (size, mtime, ctime, ino, dev); plain int is
                # equivalent to the old long() here (values compare equal)
                fp = tuple(map(int, fields[1:]))

                self._cache[path] = (sha1, fp)
        finally:
            inf.close()
208
209
210