/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""A manager of caches."""
18
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
19
import atexit
20
import os
21
import shutil
22
import tempfile
23
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
24
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
25
from bzrlib import lru_cache, trace
0.123.6 by Jelmer Vernooij
Split out reftracker.
26
from bzrlib.plugins.fastimport import (
27
    branch_mapper,
28
    )
0.123.3 by Jelmer Vernooij
Fix some imports.
29
from fastimport.helpers import (
30
    single_plural,
31
    )
0.64.279 by Jelmer Vernooij
Merge split of python-fastimport into a separate package.
32
from fastimport.reftracker import (
0.123.6 by Jelmer Vernooij
Split out reftracker.
33
    RefTracker,
34
    )
35
36
37
class _Cleanup(object):
    """Release the on-disk resources of a CacheManager when it goes away.

    The state to clean up is held by this separate helper object, rather
    than by the manager itself, to ensure that we are never in a refcycle.

    The owner fills in the attributes:

    * ``disk_blobs``: mapping whose values are sequences ending in the name
      of a file to delete, or in None when there is no dedicated file.
    * ``tempdir``: directory to remove recursively, or None.
    * ``small_blobs``: object with a ``close()`` method, or None.
    """

    def __init__(self, disk_blobs):
        self.disk_blobs = disk_blobs
        self.tempdir = None
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    def finalize(self):
        """Delete the blob files, close the shared file, remove the tempdir.

        Safe to call more than once: all three attributes are reset to None
        before any work is done, so a second call (for instance ``__del__``
        after an explicit call) finds nothing left to do.  Files and
        directories that no longer exist are skipped rather than treated as
        errors, and a failure in one step does not prevent the later steps
        from running.
        """
        # Detach everything up front so a repeated call is a no-op.
        disk_blobs, self.disk_blobs = self.disk_blobs, None
        small_blobs, self.small_blobs = self.small_blobs, None
        tempdir, self.tempdir = self.tempdir, None
        try:
            if disk_blobs is not None:
                # values() rather than itervalues(): works on Python 2 and 3.
                for info in disk_blobs.values():
                    fname = info[-1]
                    # The file may already have been removed by other
                    # cleanup code; only unlink what is still there.
                    if fname is not None and os.path.exists(fname):
                        os.unlink(fname)
        finally:
            try:
                if small_blobs is not None:
                    small_blobs.close()
            finally:
                if tempdir is not None and os.path.isdir(tempdir):
                    shutil.rmtree(tempdir)
62
63
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
64
class CacheManager(object):
0.123.6 by Jelmer Vernooij
Split out reftracker.
65
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
66
    _small_blob_threshold = 25*1024
67
    _sticky_cache_size = 300*1024*1024
68
    _sticky_flushed_size = 100*1024*1024
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
69
0.83.1 by Ian Clatworthy
head tracking tests and fix
70
    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Set up the caches used during an import.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored unchanged as ``self.verbose``
        :param inventory_cache_size: size argument handed to the LRU cache
            that holds inventories
        """
        self.verbose = verbose

        # Blob stores, keyed by dataref (either a :mark or the sha-1).
        # Sticky blobs are referenced more than once and are kept until
        # their reference count drops to 0.
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0

        # Directory that blobs are dumped into if the in-memory cache
        # overflows; stays None until the first flush to disk.
        self._tempdir = None
        # id => (offset, n_bytes, fname)
        #   fname is None when the content lives in the shared small file
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory.  Inventories are large, and we probably
        # don't need many because most parents are recent in history.
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-id -> revision-id lookup table.  All entries have
        # to be kept, but they are small.
        self.revision_ids = {}

        # NOTE(review): the original comments here described a
        # (path, branch_ref) -> file-ids cache, but no such attribute is
        # created in this constructor.

        # blob id -> number of outstanding references.  Left empty when no
        # hints are available, in which case every blob is treated as sticky.
        self._blob_ref_counts = {}
        if info is not None:
            try:
                # Per the original note, the parser hands the values back
                # as lists that are already parsed.
                counts_section = info['Blob reference counts']
                for count, blob_ids in counts_section.items():
                    refcount = int(count)
                    for blob_id in blob_ids:
                        self._blob_ref_counts[blob_id] = refcount
            except KeyError:
                # info not in file - possible when no blobs used
                pass

        # BranchMapper has no state (for now?), but one instance is kept
        # around rather than being re-created on every use.
        self.branch_mapper = branch_mapper.BranchMapper()

        self.reftracker = RefTracker()
123
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
124
    def dump_stats(self, note=trace.note):
        """Report statistics about the interesting caches.

        :param note: callable that is given each output line as one string
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        # The non-sticky blob cache and the ref tracker aren't interesting,
        # so they are left out of the report, at least for now.
        reported = (
            (self._sticky_blobs, "sticky blobs"),
            (self.revision_ids, "revision-ids"),
            )
        for cache, label in reported:
            self._show_stats_for(cache, label, note=note)
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
133
0.64.159 by Ian Clatworthy
make the file-id cache optional and branch-ref aware
134
    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Report the item count and approximate size of one dictionary.

        Both the keys and the values need to support len(); the size shown
        is the sum of those lengths, scaled to K, M or G.

        :param dict: the mapping to measure
        :param label: name shown at the start of the output line
        :param note: callable that is given the output line as one string
        :param tuple_key: if True, each key is a sequence of strings and is
            measured by the length of its parts joined together
        """
        count = len(dict)
        if tuple_key:
            key_total = sum(len(''.join(key)) for key in dict)
        else:
            key_total = sum(len(key) for key in dict)
        value_total = sum(len(value) for value in dict.values())

        # Start in kilobytes and step up while the figure is still large,
        # stopping at gigabytes.
        units = ('K', 'M', 'G')
        scale = 0
        size = (key_total + value_total) * 1.0 / 1024
        while size > 1024 and scale < len(units) - 1:
            size = size / 1024
            scale += 1

        noun = single_plural(count, "item", "items")
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, units[scale],
            count, noun))
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
155
156
    def clear_all(self):
        """Free up any memory used by the caches.

        Only in-memory state is released; the record of blobs that were
        already written to disk is left untouched.
        """
        self._blobs.clear()
        self._sticky_blobs.clear()
        # The counter tracks the bytes held in _sticky_blobs.  Leaving it
        # stale after the clear would make later stores believe the cache
        # is still full and trigger flushes with nothing to flush.
        self._sticky_memory_bytes = 0
        self.revision_ids.clear()
        self.reftracker.clear()
        self.inventories.clear()
163
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
164
    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to disk, largest first.

        Blobs are evicted until the tracked in-memory byte count is no
        longer above ``_sticky_flushed_size``, or until no sticky blobs
        remain.  Blobs shorter than ``_small_blob_threshold`` are appended
        to one shared temporary file; larger ones each get a file of their
        own.  Every evicted blob is recorded in ``_disk_blobs`` as
        ``(offset, n_bytes, fname)``, with ``fname`` None for blobs stored
        in the shared file.  The temporary directory and the shared file are
        created on the first call.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Ascending by size, so that pop() below always yields the largest
        # blob still in memory.  sorted() works on Python 2 and 3, unlike
        # sorting the result of keys() in place.
        candidates = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can be
            # destroyed 'too late' for cleanup to actually occur. Probably a
            # combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        flushed_bytes = 0
        n_small_bytes = 0
        # Also stop when nothing is left to evict: if the byte counter has
        # drifted above the real contents, popping from an empty list would
        # raise IndexError.
        while (candidates
               and self._sticky_memory_bytes > self._sticky_flushed_size):
            blob_id = candidates.pop()
            blob = sticky_blobs.pop(blob_id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                # Reads may have moved the file position; always append.
                f.seek(0, os.SEEK_END)
                self._disk_blobs[blob_id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # Wrap the descriptor in a file object: its write() does not
                # stop short the way a single os.write() call may, and the
                # descriptor is closed even if the write fails.
                out = os.fdopen(fd, 'wb')
                try:
                    out.write(blob)
                finally:
                    out.close()
                self._disk_blobs[blob_id] = (0, n_bytes, name)
            flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Nothing is held in memory any more, so resynchronise a
            # counter that may have drifted.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))
211
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
212
    def store_blob(self, id, data):
        """Store a blob of data.

        The blob is kept in the sticky cache when no reference counts are
        known at all, or when ``id`` has a reference count; storing there
        may trigger a flush to disk once the sticky cache grows past
        ``_sticky_cache_size``.  Empty data is always sticky.  Anything
        else goes into the plain blob cache.

        :param id: the dataref the blob will later be fetched by
        :param data: the blob content; must support len()
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            replaced = self._sticky_blobs.get(id)
            if replaced is not None:
                # The id is being overwritten: stop counting the old bytes,
                # otherwise the counter drifts above what is really held.
                self._sticky_memory_bytes -= len(replaced)
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif data == '':
            # Empty data is always sticky
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data
225
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
226
    def _decref(self, id, cache, fn):
        """Drop one reference to a blob, discarding it on the last one.

        :param id: the blob whose reference count is decremented
        :param cache: the mapping the blob's entry is deleted from once no
            references remain
        :param fn: name of a file to unlink together with the entry, or
            None if there is no file to remove
        :return: True if the entry was removed from ``cache``, False if it
            is still there (including when reference counting is not in
            use or ``id`` has no recorded count)
        """
        ref_counts = self._blob_ref_counts
        if not ref_counts:
            # Not reference counting at all: nothing is ever discarded.
            return False
        remaining = ref_counts.get(id)
        if remaining is None:
            return False
        remaining -= 1
        if remaining > 0:
            ref_counts[id] = remaining
            return False
        # That was the last reference.
        del cache[id]
        if fn is not None:
            os.unlink(fn)
        del ref_counts[id]
        return True
241
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
242
    def fetch_blob(self, id):
        """Fetch a blob of data.

        The plain blob cache is consulted first, and a hit there removes
        the blob from it.  Next come blobs that were flushed to disk, then
        the in-memory sticky blobs; for both of those one reference is
        dropped after the content has been read.

        :param id: the dataref the blob was stored under
        :return: the blob content
        :raises KeyError: if ``id`` is not in any of the caches
        """
        try:
            return self._blobs.pop(id)
        except KeyError:
            pass

        if id not in self._disk_blobs:
            data = self._sticky_blobs[id]
            if self._decref(id, self._sticky_blobs, None):
                # The blob just left memory; keep the byte counter in step.
                self._sticky_memory_bytes -= len(data)
            return data

        offset, n_bytes, fn = self._disk_blobs[id]
        if fn is not None:
            # The blob has a file of its own: read all of it.
            blob_file = open(fn, 'rb')
            try:
                data = blob_file.read()
            finally:
                blob_file.close()
        else:
            # The blob is a slice of the shared small-blob file.
            shared = self._cleanup.small_blobs
            shared.seek(offset)
            data = shared.read(n_bytes)
        self._decref(id, self._disk_blobs, fn)
        return data
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
264
0.123.6 by Jelmer Vernooij
Split out reftracker.
265