1
# Copyright (C) 2009 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
from __future__ import absolute_import
19
from io import BytesIO
26
from .i18n import gettext
27
from .ui import ui_factory
30
class RenameMap(object):
31
"""Determine a mapping of renames."""
33
def __init__(self, tree):
38
def iter_edge_hashes(lines):
39
"""Iterate through the hashes of line pairs (which make up an edge).
41
The hash is truncated using a modulus to avoid excessive memory
42
consumption by the hitscount dict. A modulus of 10Mi means that the
43
maximum number of keys is 10Mi. (Keys are normally 32 bits, e.g.
46
modulus = 1024 * 1024 * 10
47
for n in range(len(lines)):
48
yield hash(tuple(lines[n:n + 2])) % modulus
50
def add_edge_hashes(self, lines, tag):
51
"""Update edge_hashes to include the given lines.
53
:param lines: The lines to update the hashes for.
54
:param tag: A tag uniquely associated with these lines (i.e. file-id)
56
for my_hash in self.iter_edge_hashes(lines):
57
self.edge_hashes.setdefault(my_hash, set()).add(tag)
59
def add_file_edge_hashes(self, tree, file_ids):
60
"""Update to reflect the hashes for files in the tree.
62
:param tree: The tree containing the files.
63
:param file_ids: A list of file_ids to perform the updates for.
65
desired_files = [(tree.id2path(f), f) for f in file_ids]
66
with ui_factory.nested_progress_bar() as task:
67
for num, (file_id, contents) in enumerate(
68
tree.iter_files_bytes(desired_files)):
69
task.update(gettext('Calculating hashes'), num, len(file_ids))
71
s.writelines(contents)
73
self.add_edge_hashes(s.readlines(), file_id)
75
def hitcounts(self, lines):
76
"""Count the number of hash hits for each tag, for the given lines.
78
Hits are weighted according to the number of tags the hash is
79
associated with; more tags means that the hash is less rare and should
81
:param lines: The lines to calculate hashes of.
82
:return: a dict of {tag: hitcount}
85
for my_hash in self.iter_edge_hashes(lines):
86
tags = self.edge_hashes.get(my_hash)
93
hits[tag] += 1.0 / taglen
96
def get_all_hits(self, paths):
97
"""Find all the hit counts for the listed paths in the tree.
99
:return: A list of tuples of count, path, file_id.
102
with ui_factory.nested_progress_bar() as task:
103
for num, path in enumerate(paths):
104
task.update(gettext('Determining hash hits'), num, len(paths))
105
hits = self.hitcounts(self.tree.get_file_lines(path))
106
all_hits.extend((v, path, k) for k, v in hits.items())
109
def file_match(self, paths):
110
"""Return a mapping from file_ids to the supplied paths."""
111
return self._match_hits(self.get_all_hits(paths))
114
def _match_hits(hit_list):
115
"""Using a hit list, determine a path-to-fileid map.
117
The hit list is a list of (count, path, file_id), where count is a
118
(possibly float) number, with higher numbers indicating stronger
121
seen_file_ids = set()
123
for count, path, file_id in sorted(hit_list, reverse=True):
124
if path in path_map or file_id in seen_file_ids:
126
path_map[path] = file_id
127
seen_file_ids.add(file_id)
130
def get_required_parents(self, matches):
131
"""Return a dict of all file parents that must be versioned.
133
The keys are the required parents and the values are sets of their
136
required_parents = {}
140
path = osutils.dirname(path)
141
if self.tree.is_versioned(path):
143
required_parents.setdefault(path, []).append(child)
145
for parent, children in required_parents.items():
146
child_file_ids = set()
147
for child in children:
148
file_id = matches.get(child)
149
if file_id is not None:
150
child_file_ids.add(file_id)
151
require_ids[parent] = child_file_ids
154
def match_parents(self, required_parents, missing_parents):
155
"""Map parent directories to file-ids.
157
This is done by finding similarity between the file-ids of children of
158
required parent directories and the file-ids of children of missing
162
for file_id, file_id_children in missing_parents.items():
163
for path, path_children in required_parents.items():
164
hits = len(path_children.intersection(file_id_children))
166
all_hits.append((hits, path, file_id))
167
return self._match_hits(all_hits)
169
def _find_missing_files(self, basis):
170
missing_files = set()
172
candidate_files = set()
173
with ui_factory.nested_progress_bar() as task:
174
iterator = self.tree.iter_changes(basis, want_unversioned=True,
176
for change in iterator:
177
if change.kind[1] is None and change.versioned[1]:
178
if not self.tree.has_filename(
179
self.tree.id2path(change.parent_id[0])):
180
missing_parents.setdefault(
181
change.parent_id[0], set()).add(change.file_id)
182
if change.kind[0] == 'file':
183
missing_files.add(change.file_id)
185
# other kinds are not handled
187
if change.versioned == (False, False):
188
if self.tree.is_ignored(change.path[1]):
190
if change.kind[1] == 'file':
191
candidate_files.add(change.path[1])
192
if change.kind[1] == 'directory':
193
for _dir, children in self.tree.walkdirs(change.path[1]):
194
for child in children:
195
if child[2] == 'file':
196
candidate_files.add(child[0])
197
return missing_files, missing_parents, candidate_files
200
def guess_renames(klass, from_tree, to_tree, dry_run=False):
201
"""Guess which files to rename, and perform the rename.
203
We assume that unversioned files and missing files indicate that
204
versioned files have been renamed outside of Bazaar.
206
:param from_tree: A tree to compare from
207
:param to_tree: A write-locked working tree.
209
required_parents = {}
210
with ui_factory.nested_progress_bar() as task:
211
pp = progress.ProgressPhase('Guessing renames', 4, task)
212
with from_tree.lock_read():
215
missing_files, missing_parents, candidate_files = (
216
rn._find_missing_files(from_tree))
218
rn.add_file_edge_hashes(from_tree, missing_files)
220
matches = rn.file_match(candidate_files)
221
parents_matches = matches
222
while len(parents_matches) > 0:
223
required_parents = rn.get_required_parents(
225
parents_matches = rn.match_parents(required_parents,
227
matches.update(parents_matches)
229
delta = rn._make_inventory_delta(matches)
230
for old, new, file_id, entry in delta:
231
trace.note(gettext("{0} => {1}").format(old, new))
233
to_tree.add(required_parents)
234
to_tree.apply_inventory_delta(delta)
236
def _make_inventory_delta(self, matches):
238
file_id_matches = dict((f, p) for p, f in matches.items())
240
for f in matches.values():
242
file_id_query.append(self.tree.id2path(f))
243
except errors.NoSuchId:
245
for old_path, entry in self.tree.iter_entries_by_dir(
246
specific_files=file_id_query):
247
new_path = file_id_matches[entry.file_id]
248
parent_path, new_name = osutils.split(new_path)
249
parent_id = matches.get(parent_path)
250
if parent_id is None:
251
parent_id = self.tree.path2id(parent_path)
252
if parent_id is None:
253
added, ignored = self.tree.smart_add(
254
[parent_path], recurse=False)
255
if len(ignored) > 0 and ignored[0] == parent_path:
258
parent_id = self.tree.path2id(parent_path)
259
if entry.name == new_name and entry.parent_id == parent_id:
261
new_entry = entry.copy()
262
new_entry.parent_id = parent_id
263
new_entry.name = new_name
264
delta.append((old_path, new_path, new_entry.file_id, new_entry))