# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Import processor that dumps stats about the input (and doesn't import)."""

from bzrlib.trace import (
    note,
    warning,
    )

from bzrlib.plugins.fastimport import (
    cache_manager,
    commands,
    helpers,
    processor,
    )

class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """

    def __init__(self, target=None, params=None, verbose=0, outf=None):
        """Create the processor.

        :param target: unused here - accepted so an instance can be
            created without a target (no repository is modified)
        :param verbose: 0 for plain output, >= 1 for config-file style
            output, >= 2 to also dump rename/copy path details
        :param outf: file-like object the statistics are written to
        """
        # Allow creation without a target
        processor.ImportProcessor.__init__(self, target, params, verbose,
            outf=outf)
def pre_process(self):
49
self.note("Collecting statistics ...")
52
for cmd in commands.COMMAND_NAMES:
53
self.cmd_counts[cmd] = 0
54
self.file_cmd_counts = {}
55
for fc in commands.FILE_COMMAND_NAMES:
56
self.file_cmd_counts[fc] = 0
57
self.parent_counts = {}
58
self.max_parent_count = 0
59
self.committers = set()
60
self.separate_authors_found = False
61
self.symlinks_found = False
62
self.executables_found = False
63
self.sha_blob_references = False
64
self.lightweight_tags = 0
67
for usage in ['new', 'used', 'unknown', 'unmarked']:
68
self.blobs[usage] = set()
69
self.blob_ref_counts = {}
70
# Head tracking - delegate to the cache manager
71
self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
72
# Stuff to cache: a map from mark to # of times that mark is merged
74
# Stuff to cache: these are maps from mark to sets
75
self.rename_old_paths = {}
76
self.copy_source_paths = {}
78
def post_process(self):
80
cmd_names = commands.COMMAND_NAMES
81
fc_names = commands.FILE_COMMAND_NAMES
82
cmd_values = [self.cmd_counts[c] for c in cmd_names]
83
fc_values = [self.file_cmd_counts[c] for c in fc_names]
84
self._dump_stats_group("Command counts", cmd_names, cmd_values, str)
85
self._dump_stats_group("File command counts", fc_names, fc_values, str)
88
if self.cmd_counts['commit']:
91
for i in xrange(0, self.max_parent_count + 1):
92
if i in self.parent_counts:
93
count = self.parent_counts[i]
94
p_names.append("parents-%d" % i)
95
p_values.append(count)
96
merges_count = len(self.merges.keys())
97
p_names.append('total revisions merged')
98
p_values.append(merges_count)
100
'separate authors found': self.separate_authors_found,
101
'executables': self.executables_found,
102
'symlinks': self.symlinks_found,
103
'blobs referenced by SHA': self.sha_blob_references,
105
self._dump_stats_group("Parent counts", p_names, p_values, str)
106
self._dump_stats_group("Commit analysis", flags.keys(),
107
flags.values(), _found)
108
heads = helpers.invert_dictset(self.cache_mgr.heads)
109
self._dump_stats_group("Head analysis", heads.keys(),
110
heads.values(), None, _iterable_as_config_list)
111
# note("\t%d\t%s" % (len(self.committers), 'unique committers'))
112
self._dump_stats_group("Merges", self.merges.keys(),
113
self.merges.values(), None)
114
# We only show the rename old path and copy source paths when -vv
115
# (verbose=2) is specified. The output here for mysql's data can't
116
# be parsed currently so this bit of code needs more work anyhow ..
117
if self.verbose >= 2:
118
self._dump_stats_group("Rename old paths",
119
self.rename_old_paths.keys(),
120
self.rename_old_paths.values(), len,
121
_iterable_as_config_list)
122
self._dump_stats_group("Copy source paths",
123
self.copy_source_paths.keys(),
124
self.copy_source_paths.values(), len,
125
_iterable_as_config_list)
128
if self.cmd_counts['blob']:
129
# In verbose mode, don't list every blob used
131
del self.blobs['used']
132
self._dump_stats_group("Blob usage tracking", self.blobs.keys(),
133
self.blobs.values(), len, _iterable_as_config_list)
134
if self.blob_ref_counts:
135
blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
136
self._dump_stats_group("Blob reference counts",
137
blobs_by_count.keys(),
138
blobs_by_count.values(), len, _iterable_as_config_list)
141
if self.cmd_counts['reset']:
143
'lightweight tags': self.lightweight_tags,
145
self._dump_stats_group("Reset analysis", reset_stats.keys(),
146
reset_stats.values())
148
def _dump_stats_group(self, title, names, values, normal_formatter=None,
149
verbose_formatter=None):
150
"""Dump a statistics group.
152
In verbose mode, do so as a config file so
153
that other processors can load the information if they want to.
154
:param normal_formatter: the callable to apply to the value
155
before displaying it in normal mode
156
:param verbose_formatter: the callable to apply to the value
157
before displaying it in verbose mode
160
self.outf.write("[%s]\n" % (title,))
161
for name, value in zip(names, values):
162
if verbose_formatter is not None:
163
value = verbose_formatter(value)
164
if type(name) == str:
165
name = name.replace(' ', '-')
166
self.outf.write("%s = %s\n" % (name, value))
167
self.outf.write("\n")
169
self.outf.write("%s:\n" % (title,))
170
for name, value in zip(names, values):
171
if normal_formatter is not None:
172
value = normal_formatter(value)
173
self.outf.write("\t%s\t%s\n" % (value, name))
175
def progress_handler(self, cmd):
176
"""Process a ProgressCommand."""
177
self.cmd_counts[cmd.name] += 1
179
def blob_handler(self, cmd):
180
"""Process a BlobCommand."""
181
self.cmd_counts[cmd.name] += 1
183
self.blobs['unmarked'].add(cmd.id)
185
self.blobs['new'].add(cmd.id)
186
# Marks can be re-used so remove it from used if already there.
187
# Note: we definitely do NOT want to remove it from multi if
188
# it's already in that set.
190
self.blobs['used'].remove(cmd.id)
194
def checkpoint_handler(self, cmd):
195
"""Process a CheckpointCommand."""
196
self.cmd_counts[cmd.name] += 1
198
def commit_handler(self, cmd):
199
"""Process a CommitCommand."""
200
self.cmd_counts[cmd.name] += 1
201
self.committers.add(cmd.committer)
202
if cmd.author is not None:
203
self.separate_authors_found = True
204
for fc in cmd.file_iter():
205
self.file_cmd_counts[fc.name] += 1
206
if isinstance(fc, commands.FileModifyCommand):
208
self.executables_found = True
209
if fc.kind == commands.SYMLINK_KIND:
210
self.symlinks_found = True
211
if fc.dataref is not None:
212
if fc.dataref[0] == ':':
213
self._track_blob(fc.dataref)
215
self.sha_blob_references = True
216
elif isinstance(fc, commands.FileRenameCommand):
217
self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
218
elif isinstance(fc, commands.FileCopyCommand):
219
self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)
222
parents = self.cache_mgr.track_heads(cmd)
224
# Track the parent counts
225
parent_count = len(parents)
226
if self.parent_counts.has_key(parent_count):
227
self.parent_counts[parent_count] += 1
229
self.parent_counts[parent_count] = 1
230
if parent_count > self.max_parent_count:
231
self.max_parent_count = parent_count
233
# Remember the merges
235
#self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
236
for merge in cmd.merges:
237
if merge in self.merges:
238
self.merges[merge] += 1
240
self.merges[merge] = 1
242
def reset_handler(self, cmd):
243
"""Process a ResetCommand."""
244
self.cmd_counts[cmd.name] += 1
245
if cmd.ref.startswith('refs/tags/'):
246
self.lightweight_tags += 1
248
if cmd.from_ is not None:
249
self.cache_mgr.track_heads_for_ref(cmd.ref, cmd.from_)
251
def tag_handler(self, cmd):
252
"""Process a TagCommand."""
253
self.cmd_counts[cmd.name] += 1
255
def feature_handler(self, cmd):
256
"""Process a FeatureCommand."""
257
self.cmd_counts[cmd.name] += 1
258
feature = cmd.feature_name
259
if feature not in commands.FEATURE_NAMES:
260
self.warning("feature %s is not supported - parsing may fail"
263
def _track_blob(self, mark):
264
if mark in self.blob_ref_counts:
265
self.blob_ref_counts[mark] += 1
267
elif mark in self.blobs['used']:
268
self.blob_ref_counts[mark] = 2
269
self.blobs['used'].remove(mark)
270
elif mark in self.blobs['new']:
271
self.blobs['used'].add(mark)
272
self.blobs['new'].remove(mark)
274
self.blobs['unknown'].add(mark)
277
"""Format a found boolean as a string."""
278
return ['no', 'found'][b]
280
def _iterable_as_config_list(s):
281
"""Format an iterable as a sequence of comma-separated strings.
283
To match what ConfigObj expects, a single item list has a trailing comma.
287
return "%s," % (items[0],)
289
return ", ".join(items)