# Copyright (C) 2005, 2006 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Reconcilers are able to fix some potential data errors in a branch."""
35
from bzrlib.trace import mutter, note
36
from bzrlib.tsort import TopoSorter
39
def reconcile(dir, other=None):
    """Reconcile the data in dir.

    Currently this is limited to an inventory 'reweave'.

    This is a convenience function that builds a Reconciler and runs it.
    Library users that desire fine grained control, or that want to analyse
    the issues that were found, should use Reconciler directly.

    :param dir: the bzrdir whose data should be reconciled.
    :param other: another bzrdir to reconcile against.
    """
    reconciler = Reconciler(dir, other=other)
    reconciler.reconcile()
55
class Reconciler(object):
    """Reconcilers are used to reconcile existing data."""

    def __init__(self, dir, other=None):
        """Create a Reconciler.

        :param dir: the bzrdir whose repository will be reconciled.
        :param other: accepted for interface compatibility; not used by
            this class.
        """
        self.bzrdir = dir

    def reconcile(self):
        """Perform reconciliation.

        After reconciliation the following attributes document found issues:
        inconsistent_parents: The number of revisions in the repository whose
                              ancestry was being reported incorrectly.
        garbage_inventories: The number of inventory objects without revisions
                             that were garbage collected.
        """
        self.pb = ui.ui_factory.nested_progress_bar()
        try:
            self._reconcile()
        finally:
            # Always release the progress bar, even if reconciliation fails.
            self.pb.finished()

    def _reconcile(self):
        """Helper function for performing reconciliation."""
        self.repo = self.bzrdir.find_repository()
        self.pb.note('Reconciling repository %s',
                     self.repo.bzrdir.root_transport.base)
        repo_reconciler = self.repo.reconcile(thorough=True)
        # Publish the repository reconciler's findings on this object.
        self.inconsistent_parents = repo_reconciler.inconsistent_parents
        self.garbage_inventories = repo_reconciler.garbage_inventories
        if repo_reconciler.aborted:
            self.pb.note(
                'Reconcile aborted: revision index has inconsistent parents.')
            self.pb.note(
                'Run "bzr check" for more details.')
        else:
            self.pb.note('Reconciliation complete.')
94
class RepoReconciler(object):
    """Reconciler that reconciles a repository.

    The goal of repository reconciliation is to make any derived data
    consistent with the core data committed by a user. This can involve
    reindexing, or removing unreferenced data if that can interfere with
    queries in a given repository.

    Currently this consists of an inventory reweave with revision cross-checks.
    """

    def __init__(self, repo, other=None, thorough=False):
        """Construct a RepoReconciler.

        :param repo: the repository to reconcile.
        :param other: accepted for interface compatibility; not used by
            this class.
        :param thorough: perform a thorough check which may take longer but
                         will correct non-data loss issues such as incorrect
                         cached data.
        """
        self.garbage_inventories = 0
        self.inconsistent_parents = 0
        # Set to True by subclasses that give up before changing anything.
        self.aborted = False
        self.repo = repo
        self.thorough = thorough

    def reconcile(self):
        """Perform reconciliation.

        After reconciliation the following attributes document found issues:
        inconsistent_parents: The number of revisions in the repository whose
                              ancestry was being reported incorrectly.
        garbage_inventories: The number of inventory objects without revisions
                             that were garbage collected.
        """
        self.repo.lock_write()
        try:
            self.pb = ui.ui_factory.nested_progress_bar()
            try:
                self._reconcile_steps()
            finally:
                self.pb.finished()
        finally:
            # Release the write lock whether or not reconciliation succeeded.
            self.repo.unlock()

    def _reconcile_steps(self):
        """Perform the steps to reconcile this repository."""
        self._reweave_inventory()

    def _reweave_inventory(self):
        """Regenerate the inventory weave for the repository from scratch.

        This is a smart function: it will only do the reweave if doing it
        will correct data issues. The self.thorough flag controls whether
        only data-loss causing issues (!self.thorough) or all issues
        (self.thorough) are treated as requiring the reweave.
        """
        # local because needing to know about WeaveFile is a wart we want to
        # hide
        from bzrlib.weave import WeaveFile, Weave
        transaction = self.repo.get_transaction()
        self.pb.update('Reading inventory data.')
        self.inventory = self.repo.get_inventory_weave()
        # the total set of revisions to process
        self.pending = set(
            self.repo._revision_store.all_revision_ids(transaction))

        # mapping from revision_id to parents
        self._rev_graph = {}
        # errors that we detect
        self.inconsistent_parents = 0
        # we need the revision id of each revision and its available parents
        # list
        self._setup_steps(len(self.pending))
        for rev_id in self.pending:
            # put a revision into the graph.
            self._graph_revision(rev_id)
        self._check_garbage_inventories()
        # if there are no inconsistent_parents and
        # (no garbage inventories or we are not doing a thorough check)
        if (not self.inconsistent_parents and
            (not self.garbage_inventories or not self.thorough)):
            self.pb.note('Inventory ok.')
            return
        self.pb.update('Backing up inventory...', 0, 0)
        self.repo.control_weaves.copy(self.inventory, 'inventory.backup',
                                      self.repo.get_transaction())
        self.pb.note('Backup Inventory created.')
        # asking for '' should never return a non-empty weave
        new_inventory_vf = self.repo.control_weaves.get_empty(
            'inventory.new', self.repo.get_transaction())

        # we have topological order of revisions and non ghost parents ready.
        self._setup_steps(len(self._rev_graph))
        for rev_id in TopoSorter(self._rev_graph.items()).iter_topo_order():
            parents = self._rev_graph[rev_id]
            # double check this really is in topological order.
            unavailable = [p for p in parents if p not in new_inventory_vf]
            assert len(unavailable) == 0
            # this entry has all the non ghost parents in the inventory
            # file already.
            self._reweave_step('adding inventories')
            if isinstance(new_inventory_vf, WeaveFile):
                # It's really a WeaveFile, but we call straight into the
                # Weave's add method to disable the auto-write-out behaviour.
                # This is done to avoid a revision_count * time-to-write
                # additional overhead on reconcile.
                new_inventory_vf._check_write_ok()
                Weave._add_lines(new_inventory_vf, rev_id, parents,
                                 self.inventory.get_lines(rev_id),
                                 None, None, None, False, True)
            else:
                new_inventory_vf.add_lines(rev_id, parents,
                                           self.inventory.get_lines(rev_id))

        if isinstance(new_inventory_vf, WeaveFile):
            # Auto-write-out was bypassed above, so write the weave once now.
            new_inventory_vf._save()
        # if this worked, the set of new_inventory_vf.names should equal
        # self.pending
        assert set(new_inventory_vf.versions()) == self.pending
        self.pb.update('Writing weave')
        self.repo.control_weaves.copy(new_inventory_vf, 'inventory',
                                      self.repo.get_transaction())
        self.repo.control_weaves.delete('inventory.new',
                                        self.repo.get_transaction())
        self.inventory = None
        self.pb.note('Inventory regenerated.')

    def _setup_steps(self, new_total):
        """Setup the markers we need to control the progress bar."""
        self.total = new_total
        self.count = 0

    def _graph_revision(self, rev_id):
        """Load a revision into the revision graph.

        Records the available (non ghost) parents of rev_id in
        self._rev_graph and counts the revision in self.inconsistent_parents
        when the inventory disagrees with them.
        """
        self._reweave_step('loading revisions')
        rev = self.repo.get_revision_reconcile(rev_id)
        assert rev.revision_id == rev_id
        parents = []
        for parent in rev.parent_ids:
            if self._parent_is_available(parent):
                parents.append(parent)
            else:
                mutter('found ghost %s', parent)
        self._rev_graph[rev_id] = parents
        if self._parents_are_inconsistent(rev_id, parents):
            self.inconsistent_parents += 1
            mutter('Inconsistent inventory parents: id {%s} '
                   'inventory claims %r, '
                   'available parents are %r, '
                   'unavailable parents are %r',
                   rev_id,
                   set(self.inventory.get_parents(rev_id)),
                   set(parents),
                   set(rev.parent_ids).difference(set(parents)))

    def _parents_are_inconsistent(self, rev_id, parents):
        """Return True if the parents list of rev_id does not match the weave.

        This detects inconsistencies based on the self.thorough value:
        if thorough is on, the first parent value is checked as well as ghost
        differences.
        Otherwise only the ghost differences are evaluated.
        """
        weave_parents = self.inventory.get_parents(rev_id)
        weave_missing_old_ghosts = set(weave_parents) != set(parents)
        # Only meaningful when both lists are non-empty.
        first_parent_is_wrong = bool(
            len(weave_parents) and len(parents) and
            parents[0] != weave_parents[0])
        if self.thorough:
            return weave_missing_old_ghosts or first_parent_is_wrong
        else:
            return weave_missing_old_ghosts

    def _check_garbage_inventories(self):
        """Check for garbage inventories which we cannot trust

        We cant trust them because their pre-requisite file data may not
        be present - all we know is that their revision was not installed.

        Only performed in thorough mode; the count is stored in
        self.garbage_inventories.
        """
        if not self.thorough:
            return
        inventories = set(self.inventory.versions())
        revisions = set(self._rev_graph.keys())
        garbage = inventories.difference(revisions)
        self.garbage_inventories = len(garbage)
        for revision_id in garbage:
            mutter('Garbage inventory {%s} found.', revision_id)

    def _parent_is_available(self, parent):
        """True if parent is a fully available revision

        A fully available revision has a inventory and a revision object in the
        repository.
        """
        return (parent in self._rev_graph or
                (parent in self.inventory and self.repo.has_revision(parent)))

    def _reweave_step(self, message):
        """Mark a single step of regeneration complete."""
        self.pb.update(message, self.count, self.total)
        self.count += 1
290
class KnitReconciler(RepoReconciler):
    """Reconciler that reconciles a knit format repository.

    This will detect garbage inventories and remove them in thorough mode.
    """

    def _reconcile_steps(self):
        """Perform the steps to reconcile this repository."""
        if self.thorough:
            try:
                self._load_indexes()
            except errors.BzrCheckError:
                # Flag the abort for the caller and change nothing.
                self.aborted = True
                return
            # knits never suffer this
            self._gc_inventory()
            self._fix_text_parents()

    def _load_indexes(self):
        """Load indexes for the reconciliation."""
        self.transaction = self.repo.get_transaction()
        self.pb.update('Reading indexes.', 0, 2)
        self.inventory = self.repo.get_inventory_weave()
        self.pb.update('Reading indexes.', 1, 2)
        self.repo._check_for_inconsistent_revision_parents()
        self.revisions = self.repo._revision_store.get_revision_file(
            self.transaction)
        self.pb.update('Reading indexes.', 2, 2)

    def _gc_inventory(self):
        """Remove inventories that are not referenced from the revision store."""
        self.pb.update('Checking unused inventories.', 0, 1)
        self._check_garbage_inventories()
        self.pb.update('Checking unused inventories.', 1, 3)
        if not self.garbage_inventories:
            self.pb.note('Inventory ok.')
            return
        self.pb.update('Backing up inventory...', 0, 0)
        self.repo.control_weaves.copy(self.inventory, 'inventory.backup',
                                      self.transaction)
        self.pb.note('Backup Inventory created.')
        # asking for '' should never return a non-empty weave
        new_inventory_vf = self.repo.control_weaves.get_empty(
            'inventory.new', self.transaction)

        # we have topological order of revisions and non ghost parents ready.
        self._setup_steps(len(self.revisions))
        revision_graph = self.revisions.get_graph()
        for rev_id in TopoSorter(revision_graph.items()).iter_topo_order():
            parents = self.revisions.get_parents(rev_id)
            # double check this really is in topological order.
            unavailable = [p for p in parents if p not in new_inventory_vf]
            assert len(unavailable) == 0
            # this entry has all the non ghost parents in the inventory
            # file already.
            self._reweave_step('adding inventories')
            # ugly but needed, weaves are just way tooooo slow else.
            new_inventory_vf.add_lines(rev_id, parents,
                                       self.inventory.get_lines(rev_id))

        # if this worked, the set of new_inventory_vf.names should equal
        # the set of revisions in the revision store.
        assert (set(new_inventory_vf.versions()) ==
                set(self.revisions.versions()))
        self.pb.update('Writing weave')
        self.repo.control_weaves.copy(new_inventory_vf, 'inventory',
                                      self.transaction)
        self.repo.control_weaves.delete('inventory.new', self.transaction)
        self.inventory = None
        self.pb.note('Inventory regenerated.')

    def _check_garbage_inventories(self):
        """Check for garbage inventories which we cannot trust

        We cant trust them because their pre-requisite file data may not
        be present - all we know is that their revision was not installed.
        """
        inventories = set(self.inventory.versions())
        revisions = set(self.revisions.versions())
        garbage = inventories.difference(revisions)
        self.garbage_inventories = len(garbage)
        for revision_id in garbage:
            mutter('Garbage inventory {%s} found.', revision_id)

    def _fix_text_parents(self):
        """Fix bad versionedfile parent entries.

        It is possible for the parents entry in a versionedfile entry to be
        inconsistent with the values in the revision and inventory.

        This method finds entries with such inconsistencies, corrects their
        parent lists, and replaces the versionedfile with a corrected version.
        """
        transaction = self.repo.get_transaction()
        revision_versions = repository._RevisionTextVersionCache(self.repo)
        versions = self.revisions.versions()
        mutter('Prepopulating revision text cache with %d revisions',
               len(versions))
        revision_versions.prepopulate_revs(versions)
        used_file_versions = revision_versions.used_file_versions()
        for num, file_id in enumerate(self.repo.weave_store):
            self.pb.update('Fixing text parents', num,
                           len(self.repo.weave_store))
            vf = self.repo.weave_store.get_weave(file_id, transaction)
            vf_checker = self.repo.get_versioned_file_checker(
                vf.versions(), revision_versions)
            versions_with_bad_parents, dangling_file_versions = \
                vf_checker.check_file_version_parents(vf, file_id)
            if (len(versions_with_bad_parents) == 0 and
                len(dangling_file_versions) == 0):
                # Nothing wrong with this file; leave it untouched.
                continue
            full_text_versions = set()
            unused_versions = set()
            for dangling_version in dangling_file_versions:
                version = dangling_version[1]
                if dangling_version in used_file_versions:
                    # This version *is* used by some revision, even though it
                    # isn't used by its own revision!  We make sure any
                    # revision referencing it is stored as a fulltext
                    # This avoids bug 155730: it means that clients looking at
                    # inventories to determine the versions to fetch will not
                    # miss a required version.  (So clients can assume that if
                    # they have a complete revision graph, and fetch all file
                    # versions named by those revisions inventories, then they
                    # will not have any missing parents for 'delta' knit
                    # records.)
                    # XXX: A better, but more difficult and slower fix would be
                    # to rewrite the inventories referencing this version.
                    full_text_versions.add(version)
                else:
                    # This version is totally unreferenced.  It should be
                    # removed.
                    unused_versions.add(version)
            self._fix_text_parent(file_id, vf, versions_with_bad_parents,
                                  full_text_versions, unused_versions)

    def _fix_text_parent(self, file_id, vf, versions_with_bad_parents,
            full_text_versions, unused_versions):
        """Fix bad versionedfile entries in a single versioned file.

        :param file_id: id of the versioned file being rewritten.
        :param vf: the existing versioned file for file_id.
        :param versions_with_bad_parents: mapping from version to a sequence
            whose item [1] is used as the replacement parent list.
        :param full_text_versions: versions whose children must be stored as
            fulltexts rather than deltas.
        :param unused_versions: versions that are dropped from the rewritten
            file.
        """
        mutter('fixing text parent: %r (%d versions)', file_id,
               len(versions_with_bad_parents))
        mutter('(%d need to be full texts, %d are unused)',
               len(full_text_versions), len(unused_versions))
        new_vf = self.repo.weave_store.get_empty('temp:%s' % file_id,
                                                 self.transaction)
        new_parents = {}
        for version in vf.versions():
            if version in versions_with_bad_parents:
                parents = versions_with_bad_parents[version][1]
            else:
                parents = vf.get_parents(version)
            new_parents[version] = parents
        for version in TopoSorter(new_parents.items()).iter_topo_order():
            if version in unused_versions:
                continue
            lines = vf.get_lines(version)
            parents = new_parents[version]
            if parents and (parents[0] in full_text_versions):
                # Force this record to be a fulltext, not a delta.
                new_vf._add(version, lines, parents, False,
                            None, None, None, False)
            else:
                new_vf.add_lines(version, parents, lines)
        # Replace the original with the corrected copy, then drop the temp.
        self.repo.weave_store.copy(new_vf, file_id, self.transaction)
        self.repo.weave_store.delete('temp:%s' % file_id, self.transaction)
451
class PackReconciler(RepoReconciler):
    """Reconciler that reconciles a pack based repository.

    Garbage inventories do not affect ancestry queries, and removal is
    considerably more expensive as there is no separate versioned file for
    them, so they are not cleaned. In short it is currently a no-op.

    In future this may be a good place to hook in annotation cache checking,
    index recreation etc.
    """

    # XXX: The index corruption that _fix_text_parents performs is needed for
    # packs, but not yet implemented. The basic approach is to:
    #  - lock the names list
    #  - perform a customised pack() that regenerates data as needed
    #  - unlock the names list
    # https://bugs.edge.launchpad.net/bzr/+bug/154173

    def _reconcile_steps(self):
        """Perform the steps to reconcile this repository."""
        if not self.thorough:
            return
        collection = self.repo._pack_collection
        collection.ensure_loaded()
        collection.lock_names()
        try:
            packs = collection.all_packs()
            all_revisions = self.repo.all_revision_ids()
            # Count inventories before repacking so the difference afterwards
            # can be reported as garbage.
            total_inventories = len(list(
                collection.inventory_index.combined_index.iter_all_entries()))
            if len(all_revisions):
                self._packer = repofmt.pack_repo.ReconcilePacker(
                    collection, packs, ".reconcile", all_revisions)
                new_pack = self._packer.pack(pb=self.pb)
                if new_pack is not None:
                    self._discard_packs(packs)
            else:
                # only make a new pack when there is data to copy.
                self._discard_packs(packs)
            self.garbage_inventories = total_inventories - len(list(
                collection.inventory_index.combined_index.iter_all_entries()))
        finally:
            # Always release the names lock taken above.
            collection._unlock_names()

    def _discard_packs(self, packs):
        """Discard some packs from the repository.

        This removes them from the memory index and renames them into the
        obsolete packs directory.

        :param packs: The packs to discard.
        """
        for pack in packs:
            self.repo._pack_collection._remove_pack_from_memory(pack)
        self.repo._pack_collection._save_pack_names()
        self.repo._pack_collection._obsolete_packs(packs)