/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
0.64.334 by Jelmer Vernooij
Remove old FSF address. Thanks Dan Callaghan.
14
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
15
16
"""A manager of caches."""
17
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
18
import atexit
19
import os
20
import shutil
21
import tempfile
22
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
23
0.64.153 by Ian Clatworthy
clear caches before packing; show cache stats in verbose mode
24
from bzrlib import lru_cache, trace
0.123.6 by Jelmer Vernooij
Split out reftracker.
25
from bzrlib.plugins.fastimport import (
26
    branch_mapper,
27
    )
0.64.349 by Jelmer Vernooij
Reimport some modules removed from python-fastimport 0.9.2.
28
from bzrlib.plugins.fastimport.reftracker import (
29
    RefTracker,
30
    )
0.123.3 by Jelmer Vernooij
Fix some imports.
31
from fastimport.helpers import (
32
    single_plural,
33
    )
0.123.6 by Jelmer Vernooij
Split out reftracker.
34
35
36
class _Cleanup(object):
    """This class makes sure we clean up when CacheManager goes away.

    We use a helper class to ensure that we are never in a refcycle.

    Attributes:
      disk_blobs: mapping whose values are tuples ending in a file name, or
        None when the entry has no file of its own. Set to None once
        finalized.
      tempdir: directory to remove on cleanup, or None if none was created.
      small_blobs: open file object to close on cleanup, or None.
    """

    def __init__(self, disk_blobs):
        self.disk_blobs = disk_blobs
        self.tempdir = None
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    def finalize(self):
        """Remove the blob files, close the small-blob file, drop the tempdir.

        Safe to call more than once: every resource is forgotten as soon as
        it has been released, and files or directories that have already
        been removed by someone else are skipped rather than treated as
        errors.
        """
        if self.disk_blobs is not None:
            # values() rather than itervalues() so this also runs on
            # Python 3; list() guards against the mapping changing under us.
            for info in list(self.disk_blobs.values()):
                path = info[-1]
                if path is None:
                    continue
                try:
                    os.unlink(path)
                except OSError:
                    # Only a problem if the file is actually still there.
                    if os.path.exists(path):
                        raise
            self.disk_blobs = None
        if self.small_blobs is not None:
            self.small_blobs.close()
            self.small_blobs = None
        if self.tempdir is not None:
            # Forget the directory before removing it so that a later call
            # (e.g. from __del__) does not try to remove it again.
            tempdir, self.tempdir = self.tempdir, None
            if os.path.isdir(tempdir):
                shutil.rmtree(tempdir)
61
62
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
63
class CacheManager(object):
    """A manager of the caches used during an import.

    Blobs live in one of three places:

    * ``_blobs``: handed out once; fetch_blob() removes the entry.
    * ``_sticky_blobs``: kept in memory until their reference count (if
      any is known) runs out.
    * ``_disk_blobs``: sticky blobs that were flushed to temporary files
      because the in-memory sticky cache grew too large.

    The manager also holds the inventory cache, the mark table, the branch
    mapper and the ref tracker.
    """

    # Flushed blobs smaller than this many bytes are appended to one shared
    # temporary file; larger ones get a temporary file each.
    _small_blob_threshold = 25*1024
    # store_blob() starts a flush once the tracked sticky bytes exceed this.
    _sticky_cache_size = 300*1024*1024
    # A flush keeps going until the tracked sticky bytes are down to this.
    _sticky_flushed_size = 100*1024*1024

    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored as ``self.verbose``
        :param inventory_cache_size: size passed to the LRU cache that
            holds inventories
        """
        self.verbose = verbose

        # dataref -> data. dataref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until their
        # refcount goes to 0
        self._blobs = {}
        self._sticky_blobs = {}
        # Running total of len() of the values held in _sticky_blobs.
        self._sticky_memory_bytes = 0
        # if we overflow our memory cache, then we will dump large blobs to
        # disk in this directory
        self._tempdir = None
        # id => (offset, n_bytes, fname)
        #   if fname is None, then the content is stored in the small file
        self._disk_blobs = {}
        # Shares the _disk_blobs dict so it can remove the files later.
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache
        # these are large and we probably don't need too many as
        # most parents are recent in history
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-ids -> revision-id lookup table
        # we need to keep all of these but they are small
        self.marks = {}

        # blob id -> remaining reference count. An empty mapping means no
        # counts are known, in which case every blob is treated as sticky.
        self._blob_ref_counts = {}
        if info is not None:
            try:
                blobs_by_counts = info['Blob reference counts']
            except KeyError:
                # info not in file - possible when no blobs used
                pass
            else:
                # The parser hands values back as lists, already parsed
                for count, blob_list in blobs_by_counts.items():
                    n = int(count)
                    for b in blob_list:
                        self._blob_ref_counts[b] = n

        # BranchMapper has no state (for now?), but we keep it around rather
        # than reinstantiate on every usage
        self.branch_mapper = branch_mapper.BranchMapper()

        self.reftracker = RefTracker()

    def add_mark(self, mark, commit_id):
        """Remember the commit id that a mark refers to.

        :param mark: the mark, without its leading ':'
        :param commit_id: the value to store for the mark
        """
        assert mark[0] != ':'
        self.marks[mark] = commit_id

    def lookup_committish(self, committish):
        """Resolve a 'committish' to a revision id.

        :param committish: A "committish" string; must start with ':'
        :return: Bazaar revision id
        :raises KeyError: if no mark of that name has been added
        """
        assert committish[0] == ':'
        return self.marks[committish.lstrip(':')]

    def dump_stats(self, note=trace.note):
        """Dump some statistics about what we cached.

        :param note: callable that is given each line of output
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
        self._show_stats_for(self.marks, "revision-ids", note=note)
        # These aren't interesting so omit from the output, at least for now
        #self._show_stats_for(self._blobs, "other blobs", note=note)
        #self.reftracker.dump_stats(note=note)

    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Dump statistics about a given dictionary.

        Both the key and value need to support len().

        :param dict: the mapping to report on
        :param label: name shown at the start of the output line
        :param note: callable that is given the formatted line
        :param tuple_key: if True, each key is a sequence of strings and
            its size is the length of the strings joined together
        """
        count = len(dict)
        if tuple_key:
            size = sum(len(''.join(k)) for k in dict)
        else:
            size = sum(len(k) for k in dict)
        size += sum(len(v) for v in dict.values())
        # Report in the largest of K/M/G that keeps the number readable.
        size = size / 1024.0
        unit = 'K'
        if size > 1024:
            size = size / 1024
            unit = 'M'
            if size > 1024:
                size = size / 1024
                unit = 'G'
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            single_plural(count, "item", "items")))

    def clear_all(self):
        """Free up any memory used by the caches."""
        self._blobs.clear()
        self._sticky_blobs.clear()
        # Nothing sticky is held in memory any more; keep the byte counter
        # in step so a later store_blob() does not trigger a bogus flush.
        self._sticky_memory_bytes = 0
        self.marks.clear()
        self.reftracker.clear()
        self.inventories.clear()

    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to temporary files.

        The largest blobs are moved first, until the tracked number of
        sticky bytes is no more than ``_sticky_flushed_size`` or no sticky
        blobs are left in memory. The temporary directory and the shared
        small-blob file are created on the first flush.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Smallest first, so that pop() hands back the largest one left.
        blobs = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can be
            # destroyed 'too late' for cleanup to actually occur. Probably a
            # combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        n_flushed_bytes = 0
        n_small_bytes = 0
        # 'blobs and' guards against popping from an empty list should the
        # byte counter ever be larger than what is really held in memory.
        while blobs and self._sticky_memory_bytes > self._sticky_flushed_size:
            id = blobs.pop()
            blob = self._sticky_blobs.pop(id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                # Append to the shared file and remember where it went.
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # A file object writes the whole blob (a bare os.write() may
                # write only part of it) and the finally closes the
                # descriptor even if the write fails.
                blob_file = os.fdopen(fd, 'wb')
                try:
                    blob_file.write(blob)
                finally:
                    blob_file.close()
                self._disk_blobs[id] = (0, n_bytes, name)
            n_flushed_bytes += n_bytes
            del blob
            count += 1
        if not blobs:
            # Every sticky blob has been moved out of memory.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, n_flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))

    def store_blob(self, id, data):
        """Store a blob of data.

        :param id: the key to store the blob under
        :param data: the blob content; must support len()
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            previous = self._sticky_blobs.get(id)
            if previous is not None:
                # Replacing an entry: stop counting the bytes it used.
                self._sticky_memory_bytes -= len(previous)
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif not data:
            # Empty data is always sticky
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data

    def _decref(self, id, cache, fn):
        """Use up one reference to a blob, dropping it after the last one.

        :param id: the blob's key
        :param cache: the mapping the blob is currently stored in
        :param fn: name of the file holding the blob, or None; the file is
            deleted when the blob is dropped
        :return: True if the blob was removed from ``cache``, False
            otherwise (including when no reference counts are known)
        """
        if not self._blob_ref_counts:
            return False
        count = self._blob_ref_counts.get(id, None)
        if count is not None:
            count -= 1
            if count <= 0:
                del cache[id]
                if fn is not None:
                    os.unlink(fn)
                del self._blob_ref_counts[id]
                return True
            else:
                self._blob_ref_counts[id] = count
        return False

    def fetch_blob(self, id):
        """Fetch a blob of data.

        :param id: the key the blob was stored under
        :return: the blob content
        :raises KeyError: if the blob is in none of the caches
        """
        if id in self._blobs:
            # Non-sticky blobs are handed out exactly once.
            return self._blobs.pop(id)
        if id in self._disk_blobs:
            (offset, n_bytes, fn) = self._disk_blobs[id]
            if fn is None:
                # Stored in the shared small-blob file.
                f = self._cleanup.small_blobs
                f.seek(offset)
                content = f.read(n_bytes)
            else:
                fp = open(fn, 'rb')
                try:
                    content = fp.read()
                finally:
                    fp.close()
            self._decref(id, self._disk_blobs, fn)
            return content
        content = self._sticky_blobs[id]
        if self._decref(id, self._sticky_blobs, None):
            self._sticky_memory_bytes -= len(content)
        return content
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
276
0.123.6 by Jelmer Vernooij
Split out reftracker.
277