1
# -*- coding: utf-8 -*-
3
# Copyright (C) 2008 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program. If not, see <http://www.gnu.org/licenses/>.
18
# Based on bzr-fast-export
19
# Copyright (c) 2008 Adeodato Simó
21
# Permission is hereby granted, free of charge, to any person obtaining
22
# a copy of this software and associated documentation files (the
23
# "Software"), to deal in the Software without restriction, including
24
# without limitation the rights to use, copy, modify, merge, publish,
25
# distribute, sublicense, and/or sell copies of the Software, and to
26
# permit persons to whom the Software is furnished to do so, subject to
27
# the following conditions:
29
# The above copyright notice and this permission notice shall be included
30
# in all copies or substantial portions of the Software.
32
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
33
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
34
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
35
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
36
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
37
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
38
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
40
# vim: fileencoding=utf-8
42
"""Core engine for the fast-export command."""
44
# TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
45
# is not updated (because the parent of commit is already merged, so we don't
46
# set new_git_branch to the previously used name)
48
from email.Utils import parseaddr
52
import bzrlib.revision
61
from bzrlib.plugins.fastimport import (
66
from fastimport import commands
67
from bzrlib.plugins.fastimport.helpers import (
73
def _get_output_stream(destination):
74
if destination is None or destination == '-':
75
return binary_stream(sys.stdout)
76
elif destination.endswith('gz'):
78
return gzip.open(destination, 'wb')
80
return open(destination, 'wb')
83
def check_ref_format(refname):
84
"""Check if a refname is correctly formatted.
86
Implements all the same rules as git-check-ref-format[1].
88
[1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html
90
:param refname: The refname to check
91
:return: True if refname is valid, False otherwise
93
# These could be combined into one big expression, but are listed separately
95
if '/.' in refname or refname.startswith('.'):
97
if '/' not in refname:
102
if ord(c) < 040 or c in '\177 ~^:?*[':
104
if refname[-1] in '/.':
106
if refname.endswith('.lock'):
115
def sanitize_ref_name_for_git(refname):
116
"""Rewrite refname so that it will be accepted by git-fast-import.
117
For the detailed rules see check_ref_format.
119
By rewriting the refname we are breaking uniqueness guarantees provided by bzr
120
so we have to manually
121
verify that resulting ref names are unique.
123
:param refname: refname to rewrite
126
new_refname = re.sub(
127
# '/.' in refname or startswith '.'
132
r"|[" + "".join([chr(x) for x in range(040)]) + r"]"
147
class BzrFastExporter(object):
149
def __init__(self, source, outf, ref=None, checkpoint=-1,
150
import_marks_file=None, export_marks_file=None, revision=None,
151
verbose=False, plain_format=False, rewrite_tags=False,
152
no_tags=False, baseline=False):
153
"""Export branch data in fast import format.
155
:param plain_format: if True, 'classic' fast-import format is
156
used without any extended features; if False, the generated
157
data is richer and includes information like multiple
158
authors, revision properties, etc.
159
:param rewrite_tags: if True and if plain_format is set, tag names
160
will be rewritten to be git-compatible.
161
Otherwise tags which aren't valid for git will be skipped if
163
:param no_tags: if True tags won't be exported at all
168
self.checkpoint = checkpoint
169
self.import_marks_file = import_marks_file
170
self.export_marks_file = export_marks_file
171
self.revision = revision
172
self.excluded_revisions = set()
173
self.plain_format = plain_format
174
self.rewrite_tags = rewrite_tags
175
self.no_tags = no_tags
176
self.baseline = baseline
177
self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
178
'get_apparent_authors')
179
self.properties_to_exclude = ['authors', 'author']
181
# Progress reporting stuff
182
self.verbose = verbose
184
self.progress_every = 100
186
self.progress_every = 1000
187
self._start_time = time.time()
188
self._commit_total = 0
190
# Load the marks and initialise things accordingly
191
self.revid_to_mark = {}
192
self.branch_names = {}
193
if self.import_marks_file:
194
marks_info = marks_file.import_marks(self.import_marks_file)
195
if marks_info is not None:
196
self.revid_to_mark = dict((r, m) for m, r in
198
# These are no longer included in the marks file
199
#self.branch_names = marks_info[1]
201
def interesting_history(self):
203
rev1, rev2 = builtins._get_revision_range(self.revision,
204
self.branch, "fast-export")
205
start_rev_id = rev1.rev_id
206
end_rev_id = rev2.rev_id
210
self.note("Calculating the revisions to include ...")
211
view_revisions = [rev_id for rev_id, _, _, _ in
212
self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
213
view_revisions.reverse()
214
# If a starting point was given, we need to later check that we don't
215
# start emitting revisions from before that point. Collect the
216
# revisions to exclude now ...
217
if start_rev_id is not None:
218
self.note("Calculating the revisions to exclude ...")
219
self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
220
self.branch.iter_merge_sorted_revisions(start_rev_id)])
222
# needed so the first relative commit knows its parent
223
self.excluded_revisions.remove(start_rev_id)
224
view_revisions.insert(0, start_rev_id)
225
return list(view_revisions)
229
self.branch.repository.lock_read()
231
interesting = self.interesting_history()
232
self._commit_total = len(interesting)
233
self.note("Starting export of %d revisions ..." %
235
if not self.plain_format:
238
self.emit_baseline(interesting.pop(0), self.ref)
239
for revid in interesting:
240
self.emit_commit(revid, self.ref)
241
if self.branch.supports_tags() and not self.no_tags:
244
self.branch.repository.unlock()
246
# Save the marks if requested
250
def note(self, msg, *args):
    """Emit msg through trace.note, prefixed with the time of day.

    :param msg: the message (may contain format placeholders for args)
    :param args: extra arguments passed through to trace.note unchanged
    """
    stamped = "%s %s" % (self._time_of_day(), msg)
    trace.note(stamped, *args)
255
def warning(self, msg, *args):
    """Emit msg through trace.warning, prefixed with the time and "WARNING:".

    :param msg: the message (may contain format placeholders for args)
    :param args: extra arguments passed through to trace.warning unchanged
    """
    stamp = self._time_of_day()
    trace.warning("%s WARNING: %s" % (stamp, msg), *args)
260
def _time_of_day(self):
261
"""Time of day as a string."""
262
# Note: this is a separate method so tests can patch in a fixed value
263
return time.strftime("%H:%M:%S")
265
def report_progress(self, commit_count, details=''):
266
if commit_count and commit_count % self.progress_every == 0:
267
if self._commit_total:
268
counts = "%d/%d" % (commit_count, self._commit_total)
270
counts = "%d" % (commit_count,)
271
minutes = (time.time() - self._start_time) / 60
272
rate = commit_count * 1.0 / minutes
274
rate_str = "at %.0f/minute " % rate
276
rate_str = "at %.1f/minute " % rate
277
self.note("%s commits exported %s%s" % (counts, rate_str, details))
279
def dump_stats(self):
280
time_required = progress.str_tdelta(time.time() - self._start_time)
281
rc = len(self.revid_to_mark)
282
self.note("Exported %d %s in %s",
283
rc, single_plural(rc, "revision", "revisions"),
286
def print_cmd(self, cmd):
    """Write one command to the output stream.

    The command is rendered with ``%r`` and terminated by a newline.

    :param cmd: the command object to write
    """
    # Wrap cmd in a 1-tuple so that a tuple argument is formatted as a
    # single value rather than being unpacked as the format arguments.
    self.outf.write("%r\n" % (cmd,))
289
def _save_marks(self):
290
if self.export_marks_file:
291
revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
292
marks_file.export_marks(self.export_marks_file, revision_ids)
294
def is_empty_dir(self, tree, path):
295
path_id = tree.path2id(path)
297
self.warning("Skipping empty_dir detection - no file_id for %s" %
301
# Continue if path is not a directory
302
if tree.kind(path_id) != 'directory':
305
# Use treewalk to find the contents of our directory
306
contents = list(tree.walkdirs(prefix=path))[0]
307
if len(contents[1]) == 0:
312
def emit_features(self):
    """Print a FeatureCommand for each name in commands.FEATURE_NAMES.

    The names are emitted in sorted order.
    """
    feature_names = list(commands.FEATURE_NAMES)
    feature_names.sort()
    for name in feature_names:
        feature_cmd = commands.FeatureCommand(name)
        self.print_cmd(feature_cmd)
316
def emit_baseline(self, revid, ref):
317
# Emit a full source tree of the first commit's parent
318
revobj = self.branch.repository.get_revision(revid)
320
self.revid_to_mark[revid] = mark
321
file_cmds = self._get_filecommands(bzrlib.revision.NULL_REVISION, revid)
322
self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
324
def emit_commit(self, revid, ref):
325
if revid in self.revid_to_mark or revid in self.excluded_revisions:
328
# Get the Revision object
330
revobj = self.branch.repository.get_revision(revid)
331
except bazErrors.NoSuchRevision:
332
# This is a ghost revision. Mark it as not found and next!
333
self.revid_to_mark[revid] = -1
336
# Get the primary parent
337
# TODO: Consider the excluded revisions when deciding the parents.
338
# Currently, a commit with parents that are excluded ought to be
339
# triggering the ref calculation below (and it is not).
341
ncommits = len(self.revid_to_mark)
342
nparents = len(revobj.parent_ids)
345
# This is a parentless commit but it's not the first one
346
# output. We need to create a new temporary branch for it
347
# otherwise git-fast-import will assume the previous commit
348
# was this one's parent
349
ref = self._next_tmp_ref()
350
parent = bzrlib.revision.NULL_REVISION
352
parent = revobj.parent_ids[0]
356
self.revid_to_mark[revid] = mark
357
file_cmds = self._get_filecommands(parent, revid)
358
self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
360
# Report progress and checkpoint if it's time for that
361
self.report_progress(ncommits)
362
if (self.checkpoint > 0 and ncommits
363
and ncommits % self.checkpoint == 0):
364
self.note("Exported %i commits - adding checkpoint to output"
367
self.print_cmd(commands.CheckpointCommand())
369
def _get_name_email(self, user):
370
if user.find('<') == -1:
371
# If the email isn't inside <>, we need to use it as the name
372
# in order for things to round-trip correctly.
373
# (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
377
name, email = parseaddr(user)
378
return name.encode("utf-8"), email.encode("utf-8")
380
def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
381
# Get the committer and author info
382
committer = revobj.committer
383
name, email = self._get_name_email(committer)
384
committer_info = (name, email, revobj.timestamp, revobj.timezone)
385
if self._multi_author_api_available:
386
more_authors = revobj.get_apparent_authors()
387
author = more_authors.pop(0)
390
author = revobj.get_apparent_author()
391
if not self.plain_format and more_authors:
392
name, email = self._get_name_email(author)
393
author_info = (name, email, revobj.timestamp, revobj.timezone)
394
more_author_info = []
395
for a in more_authors:
396
name, email = self._get_name_email(a)
397
more_author_info.append(
398
(name, email, revobj.timestamp, revobj.timezone))
399
elif author != committer:
400
name, email = self._get_name_email(author)
401
author_info = (name, email, revobj.timestamp, revobj.timezone)
402
more_author_info = None
405
more_author_info = None
407
# Get the parents in terms of marks
408
non_ghost_parents = []
409
for p in revobj.parent_ids:
410
if p in self.excluded_revisions:
413
parent_mark = self.revid_to_mark[p]
414
non_ghost_parents.append(":%s" % parent_mark)
418
if non_ghost_parents:
419
from_ = non_ghost_parents[0]
420
merges = non_ghost_parents[1:]
425
# Filter the revision properties. Some metadata (like the
426
# author information) is already exposed in other ways so
427
# don't repeat it here.
428
if self.plain_format:
431
properties = revobj.properties
432
for prop in self.properties_to_exclude:
438
# Build and return the result
439
return commands.CommitCommand(git_ref, mark, author_info,
440
committer_info, revobj.message.encode("utf-8"), from_, merges, iter(file_cmds),
441
more_authors=more_author_info, properties=properties)
443
def _get_revision_trees(self, parent, revision_id):
445
tree_old = self.branch.repository.revision_tree(parent)
446
except bazErrors.UnexpectedInventoryFormat:
447
self.warning("Parent is malformed - diffing against previous parent")
448
# The parent's tree can't be read. Diff against the parent's own first parent instead.
449
pp = self.branch.repository.get_revision(parent)
450
tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
453
tree_new = self.branch.repository.revision_tree(revision_id)
454
except bazErrors.UnexpectedInventoryFormat:
455
# We can't really do anything anymore
456
self.warning("Revision %s is malformed - skipping" % revision_id)
457
return tree_old, tree_new
459
def _get_filecommands(self, parent, revision_id):
460
"""Get the list of FileCommands for the changes between two revisions."""
461
tree_old, tree_new = self._get_revision_trees(parent, revision_id)
462
if not(tree_old and tree_new):
463
# Something is wrong with this revision - ignore the filecommands
466
changes = tree_new.changes_from(tree_old)
468
# Make "modified" have 3-tuples, as added does
469
my_modified = [ x[0:3] for x in changes.modified ]
471
# The potential interaction between renames and deletes is messy.
473
file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
474
changes.renamed, changes.removed, revision_id, tree_old)
476
# Map kind changes to a delete followed by an add
477
for path, id_, kind1, kind2 in changes.kind_changed:
478
path = self._adjust_path_for_renames(path, renamed, revision_id)
479
# IGC: I don't understand why a delete is needed here.
480
# In fact, it seems harmful? If you uncomment this line,
481
# please file a bug explaining why you needed to.
482
#file_cmds.append(commands.FileDeleteCommand(path))
483
my_modified.append((path, id_, kind2))
485
# Record modifications
486
for path, id_, kind in changes.added + my_modified + rd_modifies:
488
text = tree_new.get_file_text(id_)
489
file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
490
helpers.kind_to_mode('file', tree_new.is_executable(id_)),
492
elif kind == 'symlink':
493
file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
494
helpers.kind_to_mode('symlink', False),
495
None, tree_new.get_symlink_target(id_)))
496
elif kind == 'directory':
497
if not self.plain_format:
498
file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
499
helpers.kind_to_mode('directory', False),
502
self.warning("cannot export '%s' of kind %s yet - ignoring" %
506
def _process_renames_and_deletes(self, renames, deletes,
507
revision_id, tree_old):
512
# See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
513
# In a nutshell, there are several nasty cases:
515
# 1) bzr rm a; bzr mv b a; bzr commit
516
# 2) bzr mv x/y z; bzr rm x; bzr commit
518
# The first must come out with the delete first like this:
523
# The second case must come out with the rename first like this:
528
# So outputting all deletes first or all renames first won't work.
529
# Instead, we need to make multiple passes over the various lists to
530
# get the ordering right.
534
deleted_paths = set([p for p, _, _ in deletes])
535
for (oldpath, newpath, id_, kind,
536
text_modified, meta_modified) in renames:
537
emit = kind != 'directory' or not self.plain_format
538
if newpath in deleted_paths:
540
file_cmds.append(commands.FileDeleteCommand(newpath.encode("utf-8")))
541
deleted_paths.remove(newpath)
542
if (self.is_empty_dir(tree_old, oldpath)):
543
self.note("Skipping empty dir %s in rev %s" % (oldpath,
546
#oldpath = self._adjust_path_for_renames(oldpath, renamed,
548
renamed.append([oldpath, newpath])
549
old_to_new[oldpath] = newpath
552
commands.FileRenameCommand(oldpath.encode("utf-8"), newpath.encode("utf-8")))
553
if text_modified or meta_modified:
554
modifies.append((newpath, id_, kind))
556
# Renaming a directory implies all children must be renamed.
557
# Note: changes_from() doesn't handle this
558
if kind == 'directory' and tree_old.kind(id_) == 'directory':
559
for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
560
if e.kind == 'directory' and self.plain_format:
562
old_child_path = osutils.pathjoin(oldpath, p)
563
new_child_path = osutils.pathjoin(newpath, p)
564
must_be_renamed[old_child_path] = new_child_path
566
# Add children not already renamed
568
renamed_already = set(old_to_new.keys())
569
still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
570
for old_child_path in sorted(still_to_be_renamed):
571
new_child_path = must_be_renamed[old_child_path]
573
self.note("implicitly renaming %s => %s" % (old_child_path,
575
file_cmds.append(commands.FileRenameCommand(old_child_path.encode("utf-8"),
576
new_child_path.encode("utf-8")))
578
# Record remaining deletes
579
for path, id_, kind in deletes:
580
if path not in deleted_paths:
582
if kind == 'directory' and self.plain_format:
584
#path = self._adjust_path_for_renames(path, renamed, revision_id)
585
file_cmds.append(commands.FileDeleteCommand(path.encode("utf-8")))
586
return file_cmds, modifies, renamed
588
def _adjust_path_for_renames(self, path, renamed, revision_id):
589
# If a previous rename is found, we should adjust the path
590
for old, new in renamed:
592
self.note("Changing path %s given rename to %s in revision %s"
593
% (path, new, revision_id))
595
elif path.startswith(old + '/'):
597
"Adjusting path %s given rename of %s to %s in revision %s"
598
% (path, old, new, revision_id))
599
path = path.replace(old + "/", new + "/")
603
for tag, revid in self.branch.tags.get_tag_dict().items():
605
mark = self.revid_to_mark[revid]
607
self.warning('not creating tag %r pointing to non-existent '
608
'revision %s' % (tag, revid))
610
git_ref = 'refs/tags/%s' % tag.encode("utf-8")
611
if self.plain_format and not check_ref_format(git_ref):
612
if self.rewrite_tags:
613
new_ref = sanitize_ref_name_for_git(git_ref)
614
self.warning('tag %r is exported as %r to be valid in git.',
618
self.warning('not creating tag %r as its name would not be '
619
'valid in git.', git_ref)
621
self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
623
def _next_tmp_ref(self):
624
"""Return a unique branch name. The name will start with "tmp"."""
626
if prefix not in self.branch_names:
627
self.branch_names[prefix] = 0
629
self.branch_names[prefix] += 1
630
prefix = '%s.%d' % (prefix, self.branch_names[prefix])
631
return 'refs/heads/%s' % prefix