/brz/remove-bazaar

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
1
# Copyright (C) 2008 Canonical Ltd
2
#
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
7
#
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
# GNU General Public License for more details.
12
#
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
"""Import processor that dumps stats about the input (and doesn't import)."""
18
19
20
from bzrlib.trace import (
21
    note,
22
    warning,
23
    )
24
from bzrlib.plugins.fastimport import (
0.83.2 by Ian Clatworthy
update fast-import-info to use same head-tracking code as fast-import
25
    cache_manager,
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
26
    commands,
0.64.30 by Ian Clatworthy
add heads analysis to info processor
27
    helpers,
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
28
    processor,
29
    )
30
31
32
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.

    The counters are initialised in pre_process(), updated by the
    *_handler() methods as commands arrive, and written to self.outf
    by post_process().
    """

    def __init__(self, target=None, params=None, verbose=0, outf=None):
        """Create the processor.

        All arguments are optional and are handed straight to
        processor.ImportProcessor.__init__.
        """
        # Allow creation without a target
        processor.ImportProcessor.__init__(self, target, params, verbose,
            outf=outf)

    def pre_process(self):
        """Initialise every statistic before any command is handled."""
        self.note("Collecting statistics ...")
        # Per-command and per-file-command counters, all starting at zero
        self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
        self.file_cmd_counts = dict.fromkeys(commands.FILE_COMMAND_NAMES, 0)
        # Map from number of parents to number of commits with that many
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: one set of blob ids per usage category
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        # Map from mark to reference count; an entry is created (at 2) the
        # second time a mark is referenced - see _track_blob()
        self.blob_ref_counts = {}
        # Head tracking - delegate to the cache manager
        self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump the collected statistics, one group at a time.

        Groups that have nothing to report (no commits, no blobs, no
        resets) are skipped.
        """
        # Command and file command totals
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        cmd_values = [self.cmd_counts[c] for c in cmd_names]
        fc_values = [self.file_cmd_counts[c] for c in fc_names]
        self._dump_stats_group("Command counts", cmd_names, cmd_values, str)
        self._dump_stats_group("File command counts", fc_names, fc_values, str)

        # Commit stats
        if self.cmd_counts['commit']:
            p_names = []
            p_values = []
            # Report the parent counts in ascending order, skipping any
            # count that never occurred. range() works on both Python 2
            # and 3, unlike xrange().
            for i in range(0, self.max_parent_count + 1):
                if i in self.parent_counts:
                    p_names.append("parents-%d" % i)
                    p_values.append(self.parent_counts[i])
            # Number of distinct revisions that were merged at least once
            p_names.append('total revisions merged')
            p_values.append(len(self.merges))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_names, p_values, str)
            self._dump_stats_group("Commit analysis", flags.keys(),
                flags.values(), _found)
            # NOTE(review): invert_dictset() presumably turns the cache
            # manager's heads map inside out - confirm against helpers.
            heads = helpers.invert_dictset(self.cache_mgr.heads)
            self._dump_stats_group("Head analysis", heads.keys(),
                heads.values(), None, _iterable_as_config_list)
            # TODO: self.committers is collected but not reported yet
            # (intended output: the number of unique committers).
            self._dump_stats_group("Merges", self.merges.keys(),
                self.merges.values(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.keys(),
                    self.rename_old_paths.values(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.keys(),
                    self.copy_source_paths.values(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            # In verbose mode, don't list every blob used. Note that this
            # drops the 'used' category from self.blobs for good.
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking", self.blobs.keys(),
                self.blobs.values(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            # NOTE(review): invert_dict() presumably groups the marks by
            # their reference count - confirm against helpers.
            blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
            self._dump_stats_group("Blob reference counts",
                blobs_by_count.keys(),
                blobs_by_count.values(), len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.keys(),
                reset_stats.values())

    def _dump_stats_group(self, title, names, values, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.

        :param title: the heading of the group
        :param names: the names of the statistics
        :param values: the values, in the same order as names
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            # Config-file layout: a [section] header then "name = value"
            self.outf.write("[%s]\n" % (title,))
            for name, value in zip(names, values):
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                # Only string names can contain spaces; other names
                # (e.g. integer counts) are written unchanged.
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            # Human-readable layout: a title then tab-indented value/name
            self.outf.write("%s:\n" % (title,))
            for name, value in zip(names, values):
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand.

        The blob is filed under 'unmarked' when it has no mark and under
        'new' otherwise.
        """
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Any count already recorded in blob_ref_counts is deliberately
            # left untouched.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand.

        Updates the command and file command counters, the commit flags,
        the blob usage data, the heads, the parent counts and the merges.
        """
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.file_iter():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.is_executable:
                    self.executables_found = True
                if fc.kind == commands.SYMLINK_KIND:
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # A leading ':' is treated as a mark; anything else is
                    # counted as a reference by SHA.
                    if fc.dataref[0] == ':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.cache_mgr.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        self.parent_counts[parent_count] = (
            self.parent_counts.get(parent_count, 0) + 1)
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember the merges: count how often each revision is merged
        if cmd.merges:
            for merge in cmd.merges:
                self.merges[merge] = self.merges.get(merge, 0) + 1

    def reset_handler(self, cmd):
        """Process a ResetCommand.

        A reset of a ref under refs/tags/ is counted as a lightweight tag;
        any other reset with a from_ value updates the tracked heads.
        """
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        elif cmd.from_ is not None:
            self.cache_mgr.track_heads_for_ref(cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand.

        A warning is issued when the feature is not one of
        commands.FEATURE_NAMES.
        """
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record one more reference to the blob identified by mark.

        A blob moves from 'new' to 'used' on its first reference and from
        'used' into blob_ref_counts (starting at 2) on its second; later
        references just bump the count. A mark that is in none of these
        is filed under 'unknown'.

        :param mark: the blob reference, as found in a file modify command
        """
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)
0.64.1 by Ian Clatworthy
1st cut: gfi parser + --info processing method
275
276
def _found(b):
277
    """Format a found boolean as a string."""
278
    return ['no', 'found'][b]
0.64.24 by Ian Clatworthy
smart blob caching using analysis done by --info
279
0.64.30 by Ian Clatworthy
add heads analysis to info processor
280
def _iterable_as_config_list(s):
281
    """Format an iterable as a sequence of comma-separated strings.
0.64.24 by Ian Clatworthy
smart blob caching using analysis done by --info
282
    
283
    To match what ConfigObj expects, a single item list has a trailing comma.
284
    """
285
    items = sorted(s)
286
    if len(items) == 1:
287
        return "%s," % (items[0],)
288
    else:
289
        return ", ".join(items)