/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
1
# Copyright (C) 2009 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
0.64.334 by Jelmer Vernooij
Remove old FSF address. Thanks Dan Callaghan.
14
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
15
16
"""A manager of caches."""
17
6628.1.2 by Jelmer Vernooij
Fix imports, move exporter.py, drop explorer metadata.
18
from __future__ import absolute_import
19
0.64.264 by Ian Clatworthy
Merge John's smarter caching of blobs to improve memory footprint
20
import atexit
21
import os
22
import shutil
23
import tempfile
24
import weakref
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
25
6628.1.2 by Jelmer Vernooij
Fix imports, move exporter.py, drop explorer metadata.
26
from ... import lru_cache, trace
27
from . import (
0.123.6 by Jelmer Vernooij
Split out reftracker.
28
    branch_mapper,
29
    )
6628.1.2 by Jelmer Vernooij
Fix imports, move exporter.py, drop explorer metadata.
30
from .reftracker import (
0.64.349 by Jelmer Vernooij
Reimport some modules removed from python-fastimport 0.9.2.
31
    RefTracker,
32
    )
6628.1.2 by Jelmer Vernooij
Fix imports, move exporter.py, drop explorer metadata.
33
from .helpers import (
0.123.3 by Jelmer Vernooij
Fix some imports.
34
    single_plural,
35
    )
0.123.6 by Jelmer Vernooij
Split out reftracker.
36
37
38
class _Cleanup(object):
    """Release the on-disk resources of a CacheManager when it goes away.

    The finalizer lives on this helper object, rather than on the
    CacheManager itself, to ensure that we are never in a refcycle.

    Attributes (each may be None, meaning there is nothing to release):

    * disk_blobs: dict whose values are tuples ending in the name of a
      per-blob file, or in None when the blob has no file of its own
    * tempdir: path of a directory to remove recursively
    * small_blobs: file object to close
    """

    def __init__(self, disk_blobs):
        self.disk_blobs = disk_blobs
        self.tempdir = None
        self.small_blobs = None

    def __del__(self):
        self.finalize()

    @staticmethod
    def _unlink_if_present(path):
        """Remove path, tolerating a file that has already gone away."""
        try:
            os.unlink(path)
        except OSError:
            # Only swallow the error when the goal (no such file) has been
            # reached; anything else is a genuine failure.
            if os.path.exists(path):
                raise

    def finalize(self):
        """Delete the blob files, close the small-blob file, drop the tempdir.

        Safe to call more than once: each resource is detached from this
        object before it is released, so a later call (for instance the one
        made by __del__ after an explicit finalize()) has nothing left to do.
        Files and directories that were already removed by someone else are
        not treated as errors.
        """
        disk_blobs, self.disk_blobs = self.disk_blobs, None
        if disk_blobs is not None:
            for info in disk_blobs.values():
                if info[-1] is not None:
                    self._unlink_if_present(info[-1])
        small_blobs, self.small_blobs = self.small_blobs, None
        if small_blobs is not None:
            small_blobs.close()
        tempdir, self.tempdir = self.tempdir, None
        if tempdir is not None and os.path.isdir(tempdir):
            shutil.rmtree(tempdir)
63
64
0.78.3 by Ian Clatworthy
move GenericCacheManager into its own module
65
class CacheManager(object):
    """A manager of the caches used while processing an import stream.

    It keeps blobs (in memory, spilling the largest to disk once the
    in-memory total grows past _sticky_cache_size), an LRU cache of
    inventories, the mark -> revision-id table, a BranchMapper and a
    RefTracker.
    """

    # Flushed blobs smaller than this many bytes are appended to one shared
    # temporary file; larger ones get a file each.
    _small_blob_threshold = 25*1024
    # Flush to disk once the sticky blobs held in memory exceed this size.
    _sticky_cache_size = 300*1024*1024
    # A flush keeps going until the in-memory size is down to this.
    _sticky_flushed_size = 100*1024*1024

    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored on the instance as ``verbose``
        :param inventory_cache_size: number of entries passed to the
            inventory LRU cache
        """
        self.verbose = verbose

        # dataref -> data. dataref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until
        # their refcount goes to 0
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0
        # if we overflow our memory cache, then we will dump large blobs to
        # disk in this directory
        self._tempdir = None
        # id => (offset, n_bytes, fname)
        #   if fname is None, then the content is stored in the small file
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache
        # these are large and we probably don't need too many as
        # most parents are recent in history
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-ids -> revision-id lookup table
        # we need to keep all of these but they are small
        self.marks = {}

        # Work out the blobs to make sticky - an empty table means all
        self._blob_ref_counts = {}
        if info is not None:
            try:
                blobs_by_counts = info['Blob reference counts']
                # The parser hands values back as lists, already parsed
                for count, blob_list in blobs_by_counts.items():
                    n = int(count)
                    for b in blob_list:
                        self._blob_ref_counts[b] = n
            except KeyError:
                # info not in file - possible when no blobs used
                pass

        # BranchMapper has no state (for now?), but we keep it around rather
        # than reinstantiate on every usage
        self.branch_mapper = branch_mapper.BranchMapper()

        self.reftracker = RefTracker()

    def add_mark(self, mark, commit_id):
        """Record the revision id that a mark refers to.

        :param mark: the mark, without its leading ':'
        :param commit_id: the revision id the mark resolves to
        :return: True if the mark was not known before this call, False if
            an existing entry was replaced
        """
        assert mark[0] != ':'
        # Must be evaluated before the assignment below.
        is_new = mark not in self.marks
        self.marks[mark] = commit_id
        return is_new

    def lookup_committish(self, committish):
        """Resolve a 'committish' to a revision id.

        :param committish: A "committish" string, starting with ':'
        :return: Bazaar revision id
        :raises KeyError: if the mark has not been added
        """
        assert committish[0] == ':'
        return self.marks[committish.lstrip(':')]

    def dump_stats(self, note=trace.note):
        """Dump some statistics about what we cached.

        :param note: callable that receives each line of output
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
        self._show_stats_for(self.marks, "revision-ids", note=note)
        # These aren't interesting so omit from the output, at least for now
        #self._show_stats_for(self._blobs, "other blobs", note=note)
        #self.reftracker.dump_stats(note=note)

    def _show_stats_for(self, a_dict, label, note, tuple_key=False):
        """Dump statistics about a given dictionary.

        Both the keys and the values need to support len().

        :param a_dict: the dictionary to measure
        :param label: name shown for this dictionary in the output
        :param note: callable that receives the formatted line
        :param tuple_key: if True, each key is a sequence of strings and is
            measured by the length of their concatenation
        """
        count = len(a_dict)
        if tuple_key:
            size = sum(map(len, (''.join(k) for k in a_dict)))
        else:
            size = sum(map(len, a_dict))
        size += sum(map(len, a_dict.values()))
        # Scale to the largest unit (K, M or G) that keeps the number small.
        size = size * 1.0 / 1024
        unit = 'K'
        for bigger_unit in ('M', 'G'):
            if size <= 1024:
                break
            size = size / 1024
            unit = bigger_unit
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            single_plural(count, "item", "items")))

    def clear_all(self):
        """Free up any memory used by the caches."""
        self._blobs.clear()
        self._sticky_blobs.clear()
        # Keep the byte counter in step with the (now empty) sticky cache;
        # a stale value would make the next store_blob() try to flush blobs
        # that no longer exist.
        self._sticky_memory_bytes = 0
        self.marks.clear()
        self.reftracker.clear()
        self.inventories.clear()

    def _ensure_tempdir(self):
        """Create the on-disk blob store the first time it is needed."""
        if self._tempdir is not None:
            return
        tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
        self._tempdir = tempdir
        self._cleanup.tempdir = self._tempdir
        self._cleanup.small_blobs = tempfile.TemporaryFile(
            prefix='small-blobs-', dir=self._tempdir)
        small_blob_ref = weakref.ref(self._cleanup.small_blobs)
        # Even though we add it to _Cleanup it seems that the object can be
        # destroyed 'too late' for cleanup to actually occur. Probably a
        # combination of bzr's "die directly, don't clean up" and how
        # exceptions close the running stack.
        def exit_cleanup():
            small_blob = small_blob_ref()
            if small_blob is not None:
                small_blob.close()
            shutil.rmtree(tempdir, ignore_errors=True)
        atexit.register(exit_cleanup)

    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to disk, largest first.

        Blobs are moved until the in-memory size is no larger than
        _sticky_flushed_size, or until there is nothing left to move.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Sorted smallest first so that pop() yields the largest blob left.
        blob_ids = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        self._ensure_tempdir()
        count = 0
        n_flushed_bytes = 0
        n_small_bytes = 0
        # The 'blob_ids' guard stops an out-of-step byte counter from
        # turning into an IndexError on an empty list.
        while (blob_ids
               and self._sticky_memory_bytes > self._sticky_flushed_size):
            blob_id = blob_ids.pop()
            blob = sticky_blobs.pop(blob_id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                # Small blobs share one file; remember where this one starts.
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[blob_id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # A file object writes the whole blob (os.write may stop
                # short) and closes the descriptor even if the write fails.
                with os.fdopen(fd, 'wb') as blob_file:
                    blob_file.write(blob)
                self._disk_blobs[blob_id] = (0, n_bytes, name)
            n_flushed_bytes += n_bytes
            del blob
            count += 1
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, n_flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))

    def store_blob(self, id, data):
        """Store a blob of data.

        :param id: the dataref of the blob
        :param data: the content of the blob
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            replaced = self._sticky_blobs.get(id)
            if replaced is not None:
                # Storing the same id again must not count its bytes twice.
                self._sticky_memory_bytes -= len(replaced)
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif data in ('', b''):
            # Empty data is always sticky (text or bytes alike)
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data

    def _decref(self, id, cache, fn):
        """Drop one reference to a blob, discarding it after the last one.

        :param id: the dataref of the blob
        :param cache: the dictionary currently holding the blob
        :param fn: name of the file backing the blob, or None; the file is
            removed together with the cache entry
        :return: True if the blob was removed from cache, False otherwise
            (including whenever no reference counts are being tracked)
        """
        if not self._blob_ref_counts:
            return False
        count = self._blob_ref_counts.get(id, None)
        if count is not None:
            count -= 1
            if count <= 0:
                del cache[id]
                if fn is not None:
                    os.unlink(fn)
                del self._blob_ref_counts[id]
                return True
            else:
                self._blob_ref_counts[id] = count
        return False

    def fetch_blob(self, id):
        """Fetch a blob of data.

        Non-sticky blobs are forgotten as soon as they are fetched; sticky
        blobs (in memory or on disk) lose one reference per fetch.

        :param id: the dataref of the blob
        :return: the content of the blob
        :raises KeyError: if no blob is stored under id
        """
        if id in self._blobs:
            return self._blobs.pop(id)
        if id in self._disk_blobs:
            (offset, n_bytes, fn) = self._disk_blobs[id]
            if fn is None:
                f = self._cleanup.small_blobs
                f.seek(offset)
                content = f.read(n_bytes)
            else:
                with open(fn, 'rb') as fp:
                    content = fp.read()
            self._decref(id, self._disk_blobs, fn)
            return content
        content = self._sticky_blobs[id]
        if self._decref(id, self._sticky_blobs, None):
            self._sticky_memory_bytes -= len(content)
        return content