1
# -*- coding: utf-8 -*-
3
# Copyright (C) 2008 Canonical Ltd
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
15
# You should have received a copy of the GNU General Public License
16
# along with this program. If not, see <http://www.gnu.org/licenses/>.
18
# Based on bzr-fast-export
19
# Copyright (c) 2008 Adeodato Simó
21
# Permission is hereby granted, free of charge, to any person obtaining
22
# a copy of this software and associated documentation files (the
23
# "Software"), to deal in the Software without restriction, including
24
# without limitation the rights to use, copy, modify, merge, publish,
25
# distribute, sublicense, and/or sell copies of the Software, and to
26
# permit persons to whom the Software is furnished to do so, subject to
27
# the following conditions:
29
# The above copyright notice and this permission notice shall be included
30
# in all copies or substantial portions of the Software.
32
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
33
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
34
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
35
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
36
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
37
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
38
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
40
# vim: fileencoding=utf-8
42
"""Core engine for the fast-export command."""
44
from __future__ import absolute_import
46
# TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
47
# is not updated (because the parent of commit is already merged, so we don't
48
# set new_git_branch to the previously used name)
50
from email.Utils import parseaddr
54
import breezy.revision
69
lazy_import.lazy_import(globals(),
71
from fastimport import commands
75
def _get_output_stream(destination):
76
if destination is None or destination == '-':
77
return helpers.binary_stream(sys.stdout)
78
elif destination.endswith('gz'):
80
return gzip.open(destination, 'wb')
82
return open(destination, 'wb')
85
def check_ref_format(refname):
86
"""Check if a refname is correctly formatted.
88
Implements all the same rules as git-check-ref-format[1].
90
[1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html
92
:param refname: The refname to check
93
:return: True if refname is valid, False otherwise
95
# These could be combined into one big expression, but are listed separately
97
if '/.' in refname or refname.startswith('.'):
99
if '/' not in refname:
104
if ord(c) < 040 or c in '\177 ~^:?*[':
106
if refname[-1] in '/.':
108
if refname.endswith('.lock'):
117
def sanitize_ref_name_for_git(refname):
118
"""Rewrite refname so that it will be accepted by git-fast-import.
119
For the detailed rules see check_ref_format.
121
By rewriting the refname we are breaking uniqueness guarantees provided by bzr
122
so we have to manually
123
verify that resulting ref names are unique.
125
:param refname: refname to rewrite
128
new_refname = re.sub(
129
# '/.' in refname or startswith '.'
134
r"|[" + "".join([chr(x) for x in range(040)]) + r"]"
149
class BzrFastExporter(object):
151
def __init__(self, source, outf, ref=None, checkpoint=-1,
152
import_marks_file=None, export_marks_file=None, revision=None,
153
verbose=False, plain_format=False, rewrite_tags=False,
154
no_tags=False, baseline=False):
155
"""Export branch data in fast import format.
157
:param plain_format: if True, 'classic' fast-import format is
158
used without any extended features; if False, the generated
159
data is richer and includes information like multiple
160
authors, revision properties, etc.
161
:param rewrite_tags: if True and if plain_format is set, tag names
162
will be rewritten to be git-compatible.
163
Otherwise tags which aren't valid for git will be skipped if
165
:param no_tags: if True tags won't be exported at all
170
self.checkpoint = checkpoint
171
self.import_marks_file = import_marks_file
172
self.export_marks_file = export_marks_file
173
self.revision = revision
174
self.excluded_revisions = set()
175
self.plain_format = plain_format
176
self.rewrite_tags = rewrite_tags
177
self.no_tags = no_tags
178
self.baseline = baseline
179
self._multi_author_api_available = hasattr(breezy.revision.Revision,
180
'get_apparent_authors')
181
self.properties_to_exclude = ['authors', 'author']
183
# Progress reporting stuff
184
self.verbose = verbose
186
self.progress_every = 100
188
self.progress_every = 1000
189
self._start_time = time.time()
190
self._commit_total = 0
192
# Load the marks and initialise things accordingly
193
self.revid_to_mark = {}
194
self.branch_names = {}
195
if self.import_marks_file:
196
marks_info = marks_file.import_marks(self.import_marks_file)
197
if marks_info is not None:
198
self.revid_to_mark = dict((r, m) for m, r in
200
# These are no longer included in the marks file
201
#self.branch_names = marks_info[1]
203
def interesting_history(self):
205
rev1, rev2 = builtins._get_revision_range(self.revision,
206
self.branch, "fast-export")
207
start_rev_id = rev1.rev_id
208
end_rev_id = rev2.rev_id
212
self.note("Calculating the revisions to include ...")
213
view_revisions = [rev_id for rev_id, _, _, _ in
214
self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
215
view_revisions.reverse()
216
# If a starting point was given, we need to later check that we don't
217
# start emitting revisions from before that point. Collect the
218
# revisions to exclude now ...
219
if start_rev_id is not None:
220
self.note("Calculating the revisions to exclude ...")
221
self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
222
self.branch.iter_merge_sorted_revisions(start_rev_id)])
224
# needed so the first relative commit knows its parent
225
self.excluded_revisions.remove(start_rev_id)
226
view_revisions.insert(0, start_rev_id)
227
return list(view_revisions)
231
self.branch.repository.lock_read()
233
interesting = self.interesting_history()
234
self._commit_total = len(interesting)
235
self.note("Starting export of %d revisions ..." %
237
if not self.plain_format:
240
self.emit_baseline(interesting.pop(0), self.ref)
241
for revid in interesting:
242
self.emit_commit(revid, self.ref)
243
if self.branch.supports_tags() and not self.no_tags:
246
self.branch.repository.unlock()
248
# Save the marks if requested
252
def note(self, msg, *args):
    """Report *msg* via ``trace.note``, prefixed with the time of day.

    :param msg: the message text to report
    :param args: extra positional arguments, forwarded unchanged to
        ``trace.note``
    """
    stamp = self._time_of_day()
    trace.note("%s %s" % (stamp, msg), *args)
257
def warning(self, msg, *args):
    """Report *msg* via ``trace.warning``, prefixed with the time of day.

    The emitted text has the form ``<time> WARNING: <msg>``.

    :param msg: the warning text to report
    :param args: extra positional arguments, forwarded unchanged to
        ``trace.warning``
    """
    stamp = self._time_of_day()
    trace.warning("%s WARNING: %s" % (stamp, msg), *args)
262
def _time_of_day(self):
263
"""Time of day as a string."""
264
# Note: this is a separate method so tests can patch in a fixed value
265
return time.strftime("%H:%M:%S")
267
def report_progress(self, commit_count, details=''):
268
if commit_count and commit_count % self.progress_every == 0:
269
if self._commit_total:
270
counts = "%d/%d" % (commit_count, self._commit_total)
272
counts = "%d" % (commit_count,)
273
minutes = (time.time() - self._start_time) / 60
274
rate = commit_count * 1.0 / minutes
276
rate_str = "at %.0f/minute " % rate
278
rate_str = "at %.1f/minute " % rate
279
self.note("%s commits exported %s%s" % (counts, rate_str, details))
281
def dump_stats(self):
282
time_required = progress.str_tdelta(time.time() - self._start_time)
283
rc = len(self.revid_to_mark)
284
self.note("Exported %d %s in %s",
285
rc, helpers.single_plural(rc, "revision", "revisions"),
288
def print_cmd(self, cmd):
    """Write the ``repr`` of *cmd*, followed by a newline, to ``self.outf``.

    :param cmd: the command object to serialise to the output stream
    """
    rendered = "%r\n" % cmd
    self.outf.write(rendered)
291
def _save_marks(self):
292
if self.export_marks_file:
293
revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
294
marks_file.export_marks(self.export_marks_file, revision_ids)
296
def is_empty_dir(self, tree, path):
297
path_id = tree.path2id(path)
299
self.warning("Skipping empty_dir detection - no file_id for %s" %
303
# Continue if path is not a directory
304
if tree.kind(path_id) != 'directory':
307
# Use treewalk to find the contents of our directory
308
contents = list(tree.walkdirs(prefix=path))[0]
309
if len(contents[1]) == 0:
314
def emit_features(self):
    """Print one ``FeatureCommand`` per entry of ``commands.FEATURE_NAMES``.

    Features are emitted in sorted name order.
    """
    ordered_names = sorted(commands.FEATURE_NAMES)
    for feature_name in ordered_names:
        feature_cmd = commands.FeatureCommand(feature_name)
        self.print_cmd(feature_cmd)
318
def emit_baseline(self, revid, ref):
319
# Emit a full source tree of the first commit's parent
320
revobj = self.branch.repository.get_revision(revid)
322
self.revid_to_mark[revid] = mark
323
file_cmds = self._get_filecommands(breezy.revision.NULL_REVISION, revid)
324
self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
326
def emit_commit(self, revid, ref):
327
if revid in self.revid_to_mark or revid in self.excluded_revisions:
330
# Get the Revision object
332
revobj = self.branch.repository.get_revision(revid)
333
except bazErrors.NoSuchRevision:
334
# This is a ghost revision. Mark it as not found and next!
335
self.revid_to_mark[revid] = -1
338
# Get the primary parent
339
# TODO: Consider the excluded revisions when deciding the parents.
340
# Currently, a commit with parents that are excluded ought to be
341
# triggering the ref calculation below (and it is not).
343
ncommits = len(self.revid_to_mark)
344
nparents = len(revobj.parent_ids)
347
# This is a parentless commit but it's not the first one
348
# output. We need to create a new temporary branch for it
349
# otherwise git-fast-import will assume the previous commit
350
# was this one's parent
351
ref = self._next_tmp_ref()
352
parent = breezy.revision.NULL_REVISION
354
parent = revobj.parent_ids[0]
358
self.revid_to_mark[revid] = mark
359
file_cmds = self._get_filecommands(parent, revid)
360
self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
362
# Report progress and checkpoint if it's time for that
363
self.report_progress(ncommits)
364
if (self.checkpoint > 0 and ncommits
365
and ncommits % self.checkpoint == 0):
366
self.note("Exported %i commits - adding checkpoint to output"
369
self.print_cmd(commands.CheckpointCommand())
371
def _get_name_email(self, user):
372
if user.find('<') == -1:
373
# If the email isn't inside <>, we need to use it as the name
374
# in order for things to round-trip correctly.
375
# (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
379
name, email = parseaddr(user)
380
return name.encode("utf-8"), email.encode("utf-8")
382
def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
383
# Get the committer and author info
384
committer = revobj.committer
385
name, email = self._get_name_email(committer)
386
committer_info = (name, email, revobj.timestamp, revobj.timezone)
387
if self._multi_author_api_available:
388
more_authors = revobj.get_apparent_authors()
389
author = more_authors.pop(0)
392
author = revobj.get_apparent_author()
393
if not self.plain_format and more_authors:
394
name, email = self._get_name_email(author)
395
author_info = (name, email, revobj.timestamp, revobj.timezone)
396
more_author_info = []
397
for a in more_authors:
398
name, email = self._get_name_email(a)
399
more_author_info.append(
400
(name, email, revobj.timestamp, revobj.timezone))
401
elif author != committer:
402
name, email = self._get_name_email(author)
403
author_info = (name, email, revobj.timestamp, revobj.timezone)
404
more_author_info = None
407
more_author_info = None
409
# Get the parents in terms of marks
410
non_ghost_parents = []
411
for p in revobj.parent_ids:
412
if p in self.excluded_revisions:
415
parent_mark = self.revid_to_mark[p]
416
non_ghost_parents.append(":%s" % parent_mark)
420
if non_ghost_parents:
421
from_ = non_ghost_parents[0]
422
merges = non_ghost_parents[1:]
427
# Filter the revision properties. Some metadata (like the
428
# author information) is already exposed in other ways so
429
# don't repeat it here.
430
if self.plain_format:
433
properties = revobj.properties
434
for prop in self.properties_to_exclude:
440
# Build and return the result
441
return commands.CommitCommand(git_ref, str(mark), author_info,
442
committer_info, revobj.message.encode("utf-8"), from_, merges, iter(file_cmds),
443
more_authors=more_author_info, properties=properties)
445
def _get_revision_trees(self, parent, revision_id):
447
tree_old = self.branch.repository.revision_tree(parent)
448
except bazErrors.UnexpectedInventoryFormat:
449
self.warning("Parent is malformed - diffing against previous parent")
450
# We can't find the old parent. Let's diff against its parent
451
pp = self.branch.repository.get_revision(parent)
452
tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
455
tree_new = self.branch.repository.revision_tree(revision_id)
456
except bazErrors.UnexpectedInventoryFormat:
457
# We can't really do anything anymore
458
self.warning("Revision %s is malformed - skipping" % revision_id)
459
return tree_old, tree_new
461
def _get_filecommands(self, parent, revision_id):
462
"""Get the list of FileCommands for the changes between two revisions."""
463
tree_old, tree_new = self._get_revision_trees(parent, revision_id)
464
if not(tree_old and tree_new):
465
# Something is wrong with this revision - ignore the filecommands
468
changes = tree_new.changes_from(tree_old)
470
# Make "modified" have 3-tuples, as added does
471
my_modified = [ x[0:3] for x in changes.modified ]
473
# The potential interaction between renames and deletes is messy.
475
file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
476
changes.renamed, changes.removed, revision_id, tree_old)
478
# Map kind changes to a delete followed by an add
479
for path, id_, kind1, kind2 in changes.kind_changed:
480
path = self._adjust_path_for_renames(path, renamed, revision_id)
481
# IGC: I don't understand why a delete is needed here.
482
# In fact, it seems harmful? If you uncomment this line,
483
# please file a bug explaining why you needed to.
484
#file_cmds.append(commands.FileDeleteCommand(path))
485
my_modified.append((path, id_, kind2))
487
# Record modifications
488
for path, id_, kind in changes.added + my_modified + rd_modifies:
490
text = tree_new.get_file_text(id_)
491
file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
492
helpers.kind_to_mode('file', tree_new.is_executable(id_)),
494
elif kind == 'symlink':
495
file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
496
helpers.kind_to_mode('symlink', False),
497
None, tree_new.get_symlink_target(id_)))
498
elif kind == 'directory':
499
if not self.plain_format:
500
file_cmds.append(commands.FileModifyCommand(path.encode("utf-8"),
501
helpers.kind_to_mode('directory', False),
504
self.warning("cannot export '%s' of kind %s yet - ignoring" %
508
def _process_renames_and_deletes(self, renames, deletes,
509
revision_id, tree_old):
514
# See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
515
# In a nutshell, there are several nasty cases:
517
# 1) bzr rm a; bzr mv b a; bzr commit
518
# 2) bzr mv x/y z; bzr rm x; bzr commit
520
# The first must come out with the delete first like this:
525
# The second case must come out with the rename first like this:
530
# So outputting all deletes first or all renames first won't work.
531
# Instead, we need to make multiple passes over the various lists to
532
# get the ordering right.
536
deleted_paths = set([p for p, _, _ in deletes])
537
for (oldpath, newpath, id_, kind,
538
text_modified, meta_modified) in renames:
539
emit = kind != 'directory' or not self.plain_format
540
if newpath in deleted_paths:
542
file_cmds.append(commands.FileDeleteCommand(newpath.encode("utf-8")))
543
deleted_paths.remove(newpath)
544
if (self.is_empty_dir(tree_old, oldpath)):
545
self.note("Skipping empty dir %s in rev %s" % (oldpath,
548
#oldpath = self._adjust_path_for_renames(oldpath, renamed,
550
renamed.append([oldpath, newpath])
551
old_to_new[oldpath] = newpath
554
commands.FileRenameCommand(oldpath.encode("utf-8"), newpath.encode("utf-8")))
555
if text_modified or meta_modified:
556
modifies.append((newpath, id_, kind))
558
# Renaming a directory implies all children must be renamed.
559
# Note: changes_from() doesn't handle this
560
if kind == 'directory' and tree_old.kind(id_) == 'directory':
561
for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
562
if e.kind == 'directory' and self.plain_format:
564
old_child_path = osutils.pathjoin(oldpath, p)
565
new_child_path = osutils.pathjoin(newpath, p)
566
must_be_renamed[old_child_path] = new_child_path
568
# Add children not already renamed
570
renamed_already = set(old_to_new.keys())
571
still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
572
for old_child_path in sorted(still_to_be_renamed):
573
new_child_path = must_be_renamed[old_child_path]
575
self.note("implicitly renaming %s => %s" % (old_child_path,
577
file_cmds.append(commands.FileRenameCommand(old_child_path.encode("utf-8"),
578
new_child_path.encode("utf-8")))
580
# Record remaining deletes
581
for path, id_, kind in deletes:
582
if path not in deleted_paths:
584
if kind == 'directory' and self.plain_format:
586
#path = self._adjust_path_for_renames(path, renamed, revision_id)
587
file_cmds.append(commands.FileDeleteCommand(path.encode("utf-8")))
588
return file_cmds, modifies, renamed
590
def _adjust_path_for_renames(self, path, renamed, revision_id):
591
# If a previous rename is found, we should adjust the path
592
for old, new in renamed:
594
self.note("Changing path %s given rename to %s in revision %s"
595
% (path, new, revision_id))
597
elif path.startswith(old + '/'):
599
"Adjusting path %s given rename of %s to %s in revision %s"
600
% (path, old, new, revision_id))
601
path = path.replace(old + "/", new + "/")
605
for tag, revid in self.branch.tags.get_tag_dict().items():
607
mark = self.revid_to_mark[revid]
609
self.warning('not creating tag %r pointing to non-existent '
610
'revision %s' % (tag, revid))
612
git_ref = 'refs/tags/%s' % tag.encode("utf-8")
613
if self.plain_format and not check_ref_format(git_ref):
614
if self.rewrite_tags:
615
new_ref = sanitize_ref_name_for_git(git_ref)
616
self.warning('tag %r is exported as %r to be valid in git.',
620
self.warning('not creating tag %r as its name would not be '
621
'valid in git.', git_ref)
623
self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
625
def _next_tmp_ref(self):
626
"""Return a unique branch name. The name will start with "tmp"."""
628
if prefix not in self.branch_names:
629
self.branch_names[prefix] = 0
631
self.branch_names[prefix] += 1
632
prefix = '%s.%d' % (prefix, self.branch_names[prefix])
633
return 'refs/heads/%s' % prefix