/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""A manager of caches."""
18
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
19
import atexit
20
import os
21
import shutil
22
import tempfile
23
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
24
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
25
from bzrlib import lru_cache, trace
0.123.6 by Jelmer Vernooij
Split out reftracker.
26
from bzrlib.plugins.fastimport import (
27
    branch_mapper,
28
    )
0.123.3 by Jelmer Vernooij
Fix some imports.
29
from fastimport.helpers import (
30
    single_plural,
31
    )
0.64.279 by Jelmer Vernooij
Merge split of python-fastimport into a separate package.
32
from fastimport.reftracker import (
0.123.6 by Jelmer Vernooij
Split out reftracker.
33
    RefTracker,
34
    )
35
36
37
class _Cleanup(object):
    """Release the on-disk resources of a CacheManager when it goes away.

    A separate helper carries the finalizer so that the object defining
    ``__del__`` is never part of a reference cycle.

    Attributes:
      disk_blobs: mapping whose values are tuples ending in a file name,
        or ending in None when the entry has no file of its own.
      small_blobs: an open file object to close, or None.
      tempdir: a directory to remove, or None.
    """

    def __init__(self, disk_blobs):
        self.disk_blobs = disk_blobs
        self.tempdir = None
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    def finalize(self):
        """Delete the blob files, close the shared file, remove the tempdir.

        Every resource is forgotten once it has been released, so calling
        this more than once (for example explicitly and then again from
        ``__del__``) is harmless.
        """
        if self.disk_blobs is not None:
            # values() rather than itervalues(): works on Python 2 and 3.
            for info in self.disk_blobs.values():
                if info[-1] is not None:
                    os.unlink(info[-1])
            self.disk_blobs = None
        if self.small_blobs is not None:
            self.small_blobs.close()
            self.small_blobs = None
        if self.tempdir is not None:
            shutil.rmtree(self.tempdir)
            # Forget the directory so a repeated call does not try to
            # remove it a second time.
            self.tempdir = None
62
63
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
64
class CacheManager(object):
0.123.6 by Jelmer Vernooij
Split out reftracker.
65
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
66
    _small_blob_threshold = 25*1024
67
    _sticky_cache_size = 300*1024*1024
68
    _sticky_flushed_size = 100*1024*1024
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
69
0.83.1 by Ian Clatworthy
head tracking tests and fix
70
    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Set up the (initially empty) caches.

        :param info: a ConfigObj holding the output from the --info
            processor, or None if no hints are available
        :param verbose: kept on ``self.verbose``
        :param inventory_cache_size: size given to the LRU cache that
            holds inventories
        """
        self.verbose = verbose

        # dataref -> data, where dataref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until
        # their refcount goes to 0.
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0

        # If the memory cache overflows, large blobs are dumped to disk in
        # this directory.
        self._tempdir = None

        # id => (offset, n_bytes, fname)
        #   a fname of None means the content is stored in the small file
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache.
        # Inventories are large and we probably don't need too many, as
        # most parents are recent in history.
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-ids -> revision-id lookup table.
        # All of these need to be kept, but they are small.
        self.marks = {}

        # blob id -> reference count, filled in from the info hints.
        # When this stays empty, every blob is treated as sticky.
        self._blob_ref_counts = {}
        if info is not None:
            try:
                counts_section = info['Blob reference counts']
                # The parser hands values back as lists, already parsed
                for count_text, blob_ids in counts_section.items():
                    self._blob_ref_counts.update(
                        dict.fromkeys(blob_ids, int(count_text)))
            except KeyError:
                # info not in file - possible when no blobs used
                pass

        # BranchMapper has no state (for now?), but we keep it around rather
        # than reinstantiate on every usage
        self.branch_mapper = branch_mapper.BranchMapper()

        self.reftracker = RefTracker()
123
0.129.2 by Jelmer Vernooij
Use lookup functions for committish.
124
    def add_mark(self, mark, commit_id):
125
        assert mark[0] != ':'
126
        self.marks[mark] = commit_id
127
128
    def lookup_committish(self, committish):
129
        """Resolve a 'committish' to a revision id.
130
131
        :param committish: A "committish" string
132
        :return: Bazaar revision id
133
        """
134
        assert committish[0] == ':'
135
        return self.marks[committish.lstrip(':')]
136
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
137
    def dump_stats(self, note=trace.note):
        """Report statistics about the sticky blob and mark caches.

        :param note: callable that is given each line of output
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        reported = (
            (self._sticky_blobs, "sticky blobs"),
            (self.marks, "revision-ids"),
            )
        for cache, label in reported:
            self._show_stats_for(cache, label, note=note)
        # The other (non-sticky) blobs and the reftracker aren't
        # interesting, so they are omitted from the output, at least for now.
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
146
0.64.159 by Ian Clatworthy
make the file-id cache optional and branch-ref aware
147
    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Report the item count and approximate size of a dictionary.

        The size is the summed len() of all keys and values, so both the
        keys and the values need to support len().

        :param dict: the dictionary to describe
        :param label: name shown at the start of the output line
        :param note: callable that is given the formatted line
        :param tuple_key: if True, each key is a sequence of strings whose
            parts are joined before being measured
        """
        n_items = len(dict)
        if tuple_key:
            key_lengths = [len(''.join(k)) for k in dict.keys()]
        else:
            key_lengths = [len(k) for k in dict.keys()]
        value_lengths = [len(v) for v in dict.values()]
        scaled = (sum(key_lengths) + sum(value_lengths)) * 1.0 / 1024
        unit = 'K'
        # Step up to the next unit for as long as the number is above 1024.
        for bigger_unit in ('M', 'G'):
            if not scaled > 1024:
                break
            scaled = scaled / 1024
            unit = bigger_unit
        plural = single_plural(n_items, "item", "items")
        note("    %-12s: %8.1f %s (%d %s)"
             % (label, scaled, unit, n_items, plural))
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
168
169
    def clear_all(self):
170
        """Free up any memory used by the caches."""
171
        self._blobs.clear()
172
        self._sticky_blobs.clear()
0.129.2 by Jelmer Vernooij
Use lookup functions for committish.
173
        self.marks.clear()
0.123.6 by Jelmer Vernooij
Split out reftracker.
174
        self.reftracker.clear()
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
175
        self.inventories.clear()
176
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
177
    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to disk, largest first.

        Blobs are evicted until the tracked sticky byte count is no larger
        than ``_sticky_flushed_size`` (or no blobs are left). Blobs smaller
        than ``_small_blob_threshold`` are appended to one shared temporary
        file; every other blob gets a file of its own. Where each blob went
        is recorded in ``self._disk_blobs`` as (offset, n_bytes, fname),
        with a fname of None for blobs held in the shared file.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Smallest first, so that pop() hands back the largest blob left.
        # sorted() is used because keys() is not a sortable list on Python 3.
        blobs = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can be
            # destroyed 'too late' for cleanup to actually occur. Probably a
            # combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        n_flushed_bytes = 0
        n_small_bytes = 0
        # The "blobs" test stops the loop if the byte count has drifted
        # above what is really held, instead of popping from an empty list.
        while blobs and self._sticky_memory_bytes > self._sticky_flushed_size:
            blob_id = blobs.pop()
            blob = sticky_blobs.pop(blob_id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[blob_id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # A file object writes the whole blob (a bare os.write may
                # stop short) and the finally closes the descriptor even
                # when the write fails.
                out = os.fdopen(fd, 'wb')
                try:
                    out.write(blob)
                finally:
                    out.close()
                self._disk_blobs[blob_id] = (0, n_bytes, name)
            n_flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Nothing is held in memory any more, so whatever is left in
            # the count is drift; drop it.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, n_flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))
224
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
225
    def store_blob(self, id, data):
226
        """Store a blob of data."""
0.64.169 by Ian Clatworthy
fix blob tracking when -v not given
227
        # Note: If we're not reference counting, everything has to be sticky
228
        if not self._blob_ref_counts or id in self._blob_ref_counts:
229
            self._sticky_blobs[id] = data
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
230
            self._sticky_memory_bytes += len(data)
231
            if self._sticky_memory_bytes > self._sticky_cache_size:
232
                self._flush_blobs_to_disk()
0.64.169 by Ian Clatworthy
fix blob tracking when -v not given
233
        elif data == '':
234
            # Empty data is always sticky
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
235
            self._sticky_blobs[id] = data
236
        else:
237
            self._blobs[id] = data
238
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
239
    def _decref(self, id, cache, fn):
240
        if not self._blob_ref_counts:
241
            return False
242
        count = self._blob_ref_counts.get(id, None)
243
        if count is not None:
244
            count -= 1
245
            if count <= 0:
246
                del cache[id]
247
                if fn is not None:
248
                    os.unlink(fn)
249
                del self._blob_ref_counts[id]
250
                return True
251
            else:
252
                self._blob_ref_counts[id] = count
253
        return False
254
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
255
    def fetch_blob(self, id):
256
        """Fetch a blob of data."""
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
257
        if id in self._blobs:
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
258
            return self._blobs.pop(id)
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
259
        if id in self._disk_blobs:
260
            (offset, n_bytes, fn) = self._disk_blobs[id]
261
            if fn is None:
262
                f = self._cleanup.small_blobs
263
                f.seek(offset)
264
                content = f.read(n_bytes)
265
            else:
266
                fp = open(fn, 'rb')
267
                try:
268
                    content = fp.read()
269
                finally:
270
                    fp.close()
271
            self._decref(id, self._disk_blobs, fn)
272
            return content
273
        content = self._sticky_blobs[id]
274
        if self._decref(id, self._sticky_blobs, None):
275
            self._sticky_memory_bytes -= len(content)
276
        return content
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
277
0.123.6 by Jelmer Vernooij
Split out reftracker.
278