/brz/remove-bazaar : contents of revisionloader.py at revision 0.64.74

: (revision 0.64.74)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/brz/remove-bazaar

# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Parameterised loading of revisions into a repository."""


from bzrlib import errors, knit, lru_cache, osutils
from bzrlib import revision as _mod_revision


class RevisionLoader(object):
    # NOTE: In effect this is bzrlib.repository._install_revision recast
    # as a class. Importers need more control over how earlier revisions
    # are cached, how data is fed in, and so on.

    def __init__(self, repo):
        """Create a loader that installs revisions into a repository.

        NOTE: This class does not manage repository locking. Clients
        should take a write lock, call load() as often as needed, then
        release the lock.

        :param repo: the target repository
        """
        self.repo = repo

    def load(self, rev, inv, signature, text_provider,
        inventories_provider=None):
        """Install one revision into the repository.

        The file texts are loaded first, then the inventory, then the
        signature (if one was given) and finally the revision itself.

        :param rev: the Revision; its inventory_sha1 attribute is set to
            the value returned when the inventory is added
        :param inv: the inventory
        :param signature: signing information, or None if there is none
        :param text_provider: a callable expecting a file_id parameter
            that returns the text for that file-id
        :param inventories_provider: a callable that is given the list of
            parent revision-ids and returns a pair holding:
              * the list of revision-ids present in the repository
              * the list of inventories for the revision-ids,
                including an empty inventory for the missing revisions
            If None, a default implementation is provided.
        """
        provider = inventories_provider
        if provider is None:
            provider = self._default_inventories_provider
        revision_id = rev.revision_id
        present_parents, parent_invs = provider(rev.parent_ids)
        self._load_texts(revision_id, inv.iter_entries(), parent_invs,
            text_provider)
        try:
            inventory_sha1 = self._add_inventory(revision_id, inv,
                present_parents)
        except errors.RevisionAlreadyPresent:
            # The inventory is already stored; rev.inventory_sha1 is left
            # exactly as the caller supplied it.
            pass
        else:
            rev.inventory_sha1 = inventory_sha1
        if signature is not None:
            self.repo.add_signature_text(revision_id, signature)
        self.repo.add_revision(revision_id, rev, inv)

    def _load_texts(self, revision_id, entries, parent_invs, text_provider):
        """Store the file texts introduced by a revision.

        This method is provided for subclasses to use or override.

        :param revision_id: the revision identifier
        :param entries: iterator over the (path, inventory entry) pairs
        :param parent_invs: the parent inventories
        :param text_provider: a callable expecting a file_id parameter
            that returns the text for that file-id
        """
        repo = self.repo

        # Backwards compatibility hack: skip the root id.
        if not repo.supports_rich_root():
            _root_path, root_entry = entries.next()
            if root_entry.revision != revision_id:
                raise errors.IncompatibleRevision(repr(repo))

        # Add the texts that are not already present
        transaction = repo.get_transaction()
        for _path, entry in entries:
            # Asking the weave whether it already holds entry.revision is
            # *really* slow (over 50% of import time), so only the entry's
            # own revision marker is consulted. That assumes the version
            # is not already stored: a shared repository might have it in
            # the general case, but the check is arguably unnecessary
            # when importing from a foreign system.
            if entry.revision != revision_id:
                continue
            file_id = entry.file_id
            # Collect, in parent order and without duplicates, the
            # revision each parent inventory records for this file.
            text_parents = []
            for parent_inv in parent_invs:
                if file_id in parent_inv:
                    parent_revision = parent_inv[file_id].revision
                    if parent_revision not in text_parents:
                        text_parents.append(parent_revision)
            versioned_file = repo.weave_store.get_weave_or_empty(file_id,
                transaction)
            lines = text_provider(file_id)
            versioned_file.add_lines(revision_id, text_parents, lines)

    def _add_inventory(self, revision_id, inv, parents):
        """Store the inventory inv in the repository under revision_id.

        :param parents: The revision ids of the parents that revision_id
                        is known to have and are in the repository already.

        :returns: The validator(which is a sha1 digest, though what is sha'd is
            repository format specific) of the serialized inventory.
        """
        validator = self.repo.add_inventory(revision_id, inv, parents)
        return validator

    def _default_inventories_provider(self, revision_ids):
        """An inventories provider that queries the repository.

        :param revision_ids: the revision-ids to look up
        :returns: a pair (present, inventories) where present lists the
            ids the repository has and inventories holds one inventory per
            requested id; ids that are missing get the inventory of
            revision_tree(None).
        """
        present = []
        inventories = []
        for revision_id in revision_ids:
            if self.repo.has_revision(revision_id):
                present.append(revision_id)
                tree_id = revision_id
            else:
                tree_id = None
            inventories.append(self.repo.revision_tree(tree_id).inventory)
        return present, inventories


class ImportRevisionLoader(RevisionLoader):
    """A RevisionLoader optimised for importing.

    This implementation caches serialised inventory texts.
    """

    def __init__(self, repo, parent_texts_to_cache=1, random_ids=True):
        """See RevisionLoader.__init__.

        :param repo: the target repository
        :param parent_texts_to_cache: the number of parent texts to cache;
            used as the size of the LRU cache of inventory parent texts
        :param random_ids: the value passed as ``random_id`` whenever
            inventory lines are added to the inventory versioned file
            (presumably this lets the store skip its "already present"
            check -- confirm against the bzrlib version in use)
        """
        RevisionLoader.__init__(self, repo)
        self.inv_parent_texts = lru_cache.LRUCache(parent_texts_to_cache)
        self.random_ids = random_ids

    def _add_inventory(self, revision_id, inv, parents):
        """See RevisionLoader._add_inventory.

        In addition to storing the inventory, the parent text returned by
        _inventory_add_lines() is remembered in self.inv_parent_texts so
        that it can be handed back in when a later inventory is added.

        :raises AssertionError: if the repository is not in a write group,
            if inv.revision_id is set but differs from revision_id, or if
            the inventory has no root.
        """
        # Code taken from bzrlib.repository.add_inventory
        # The preconditions are raised explicitly rather than written as
        # assert statements so that they still hold under ``python -O``.
        if not self.repo.is_in_write_group():
            raise AssertionError("%r not in write group" % (self.repo,))
        _mod_revision.check_not_reserved_id(revision_id)
        if not (inv.revision_id is None or inv.revision_id == revision_id):
            raise AssertionError(
                "Mismatch between inventory revision"
                " id and insertion revid (%r, %r)"
                % (inv.revision_id, revision_id))
        if inv.root is None:
            raise AssertionError(
                "inventory for %r has no root" % (revision_id,))
        inv_lines = self.repo._serialise_inventory_to_lines(inv)
        inv_vf = self.repo.get_inventory_weave()

        sha1, num_bytes, parent_text = self._inventory_add_lines(inv_vf,
            revision_id, parents, inv_lines, self.inv_parent_texts)
        self.inv_parent_texts[revision_id] = parent_text
        return sha1

    def _inventory_add_lines(self, inv_vf, version_id, parents, lines,
            parent_texts):
        """See Repository._inventory_add_lines().

        :param inv_vf: the inventory versioned file to add to
        :param version_id: the id to store the lines under
        :param parents: candidate parent ids; those not contained in
            inv_vf are dropped before the lines are added
        :param lines: the serialised inventory lines
        :param parent_texts: cache of previously returned parent texts
        :returns: whatever inv_vf.add_lines() returns; _add_inventory()
            unpacks it as (sha1, number of bytes, parent text).
        """
        final_parents = [parent for parent in parents if parent in inv_vf]
        return inv_vf.add_lines(version_id, final_parents, lines, parent_texts,
            random_id=self.random_ids, check_content=False)


class ExperimentalRevisionLoader(ImportRevisionLoader):
    """A RevisionLoader over-optimised for importing.

    WARNING: This implementation uses undocumented bzrlib internals.
    It may not work in the future. In fact, it may not work now as
    it is an incubator for experimental code.
    """

    def __init__(self, repo, parent_texts_to_cache=1, fulltext_every=200,
            random_ids=True):
        """See ImportRevisionLoader.__init__.

        :param fulltext_every: how often to store an inventory fulltext;
            every fulltext_every-th inventory added through this loader is
            stored as a fulltext instead of a delta. Must be non-zero
            because it is used as a modulus.
        :param random_ids: forwarded unchanged to
            ImportRevisionLoader.__init__. Defaults to True, which is the
            value this class always used before the parameter existed.
        """
        ImportRevisionLoader.__init__(self, repo, parent_texts_to_cache,
            random_ids)
        # Number of inventories added so far through this loader.
        self.revision_count = 0
        self.fulltext_every = fulltext_every

    def _inventory_add_lines(self, inv_vf, version_id, parents, lines,
            parent_texts):
        """See Repository._inventory_add_lines().

        :param inv_vf: the inventory versioned file to add to
        :param version_id: the id to store the lines under; must be a str
        :param parents: the parent ids; all of them are recorded in the
            index, but only those present in inv_vf are used for deltas
        :param lines: the serialised inventory lines
        :param parent_texts: cache of parent texts, or None
        :returns: a tuple (digest, text_length, content) where digest and
            text_length describe the joined lines as passed in (before any
            trailing newline is added) and content is the object built by
            inv_vf.factory.make().
        :raises AssertionError: if version_id is not a str.
        """
        # setup parameters used in original code but not this API
        self.revision_count += 1
        if self.revision_count % self.fulltext_every == 0:
            # Periodically force a fulltext regardless of what inv_vf wants.
            delta = False
        else:
            delta = inv_vf.delta
        left_matching_blocks = None
        random_id = self.random_ids
        check_content = False

        # bzrlib.knit.add_lines() but error checking optimised
        inv_vf._check_add(version_id, lines, random_id, check_content)

        ####################################################################
        # bzrlib.knit._add() but skip checking if fulltext better than delta
        ####################################################################

        line_bytes = ''.join(lines)
        digest = osutils.sha_string(line_bytes)
        present_parents = [parent for parent in parents
                           if inv_vf.has_version(parent)]
        if parent_texts is None:
            parent_texts = {}

        # can only compress against the left most present parent.
        if (delta and
            (len(present_parents) == 0 or
             present_parents[0] != parents[0])):
            delta = False

        text_length = len(line_bytes)
        options = []
        if lines:
            if lines[-1][-1] != '\n':
                # copy the contents of lines.
                lines = lines[:]
                options.append('no-eol')
                lines[-1] = lines[-1] + '\n'
                line_bytes += '\n'

        # Unlike the original code, the delta chain length check
        # (inv_vf._check_should_delta(present_parents)) is deliberately not
        # made here; the fulltext_every counter above is used instead.

        # Raised explicitly rather than asserted so that the check still
        # holds under ``python -O``.
        if not isinstance(version_id, str):
            raise AssertionError(
                "version_id must be a str, not %r" % (version_id,))
        content = inv_vf.factory.make(lines, version_id)
        if delta or (inv_vf.factory.annotated and len(present_parents) > 0):
            # Merge annotations from parent texts if needed.
            delta_hunks = inv_vf._merge_annotations(content, present_parents,
                parent_texts, delta, inv_vf.factory.annotated,
                left_matching_blocks)

        if delta:
            # delta_hunks is always bound here: a true delta makes the
            # condition above true as well.
            options.append('line-delta')
            store_lines = inv_vf.factory.lower_line_delta(delta_hunks)
            size, data = inv_vf._data._record_to_data(version_id, digest,
                store_lines)
        else:
            options.append('fulltext')
            # isinstance is slower and we have no hierarchy.
            if inv_vf.factory.__class__ == knit.KnitPlainFactory:
                # Use the already joined bytes saving iteration time in
                # _record_to_data.
                size, data = inv_vf._data._record_to_data(version_id, digest,
                    lines, [line_bytes])
            else:
                # get mixed annotation + content and feed it into the
                # serialiser.
                store_lines = inv_vf.factory.lower_fulltext(content)
                size, data = inv_vf._data._record_to_data(version_id, digest,
                    store_lines)

        access_memo = inv_vf._data.add_raw_records([size], data)[0]
        inv_vf._index.add_versions(
            ((version_id, options, access_memo, parents),),
            random_id=random_id)
        return digest, text_length, content

0.64.5 by Ian Clatworthy first cut at generic processing method	1	# Copyright (C) 2008 Canonical Ltd
	2	#
	3	# This program is free software; you can redistribute it and/or modify
	4	# it under the terms of the GNU General Public License as published by
	5	# the Free Software Foundation; either version 2 of the License, or
	6	# (at your option) any later version.
	7	#
	8	# This program is distributed in the hope that it will be useful,
	9	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	# GNU General Public License for more details.
	12	#
	13	# You should have received a copy of the GNU General Public License
	14	# along with this program; if not, write to the Free Software
	15	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	16
	17	"""Parameterised loading of revisions into a repository."""
	18
	19
0.64.49 by Ian Clatworthy skip check re fulltext storage better than delta for inventories when in experimental mode	20	from bzrlib import errors, knit, lru_cache, osutils
0.64.44 by Ian Clatworthy smart caching of serialised inventories	21	from bzrlib import revision as _mod_revision
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	22
	23
0.64.5 by Ian Clatworthy first cut at generic processing method	24	class RevisionLoader(object):
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	25	# NOTE: This is effectively bzrlib.repository._install_revision
	26	# refactored to be a class. When importing, we want more flexibility
	27	# in how previous revisions are cached, data is fed in, etc.
0.64.5 by Ian Clatworthy first cut at generic processing method	28
0.64.48 by Ian Clatworthy one revision loader instance	29	def __init__(self, repo):
0.64.5 by Ian Clatworthy first cut at generic processing method	30	"""An object responsible for loading revisions into a repository.
	31
	32	NOTE: Repository locking is not managed by this class. Clients
	33	should take a write lock, call load() multiple times, then release
	34	the lock.
	35
	36	:param repository: the target repository
0.64.48 by Ian Clatworthy one revision loader instance	37	"""
	38	self.repo = repo
	39
	40	def load(self, rev, inv, signature, text_provider,
	41	inventories_provider=None):
	42	"""Load a revision into a repository.
	43
	44	:param rev: the Revision
	45	:param inv: the inventory
	46	:param signature: signing information
	47	:param text_provider: a callable expecting a file_id parameter
	48	that returns the text for that file-id
0.64.5 by Ian Clatworthy first cut at generic processing method	49	:param inventories_provider: a callable expecting a repository and
	50	a list of revision-ids, that returns:
	51	* the list of revision-ids present in the repository
	52	* the list of inventories for the revision-id's,
	53	including an empty inventory for the missing revisions
	54	If None, a default implementation is provided.
	55	"""
0.64.48 by Ian Clatworthy one revision loader instance	56	if inventories_provider is None:
	57	inventories_provider = self._default_inventories_provider
	58	present_parents, parent_invs = inventories_provider(rev.parent_ids)
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	59	self._load_texts(rev.revision_id, inv.iter_entries(), parent_invs,
0.64.5 by Ian Clatworthy first cut at generic processing method	60	text_provider)
	61	try:
0.64.44 by Ian Clatworthy smart caching of serialised inventories	62	rev.inventory_sha1 = self._add_inventory(rev.revision_id,
0.64.6 by Ian Clatworthy generic processing method working for one revision in one branch	63	inv, present_parents)
0.64.5 by Ian Clatworthy first cut at generic processing method	64	except errors.RevisionAlreadyPresent:
	65	pass
	66	if signature is not None:
	67	self.repo.add_signature_text(rev.revision_id, signature)
	68	self.repo.add_revision(rev.revision_id, rev, inv)
	69
	70	def _load_texts(self, revision_id, entries, parent_invs, text_provider):
	71	"""Load texts to a repository for inventory entries.
	72
	73	This method is provided for subclasses to use or override.
	74
	75	:param revision_id: the revision identifier
	76	:param entries: iterator over the inventory entries
	77	:param parent_inv: the parent inventories
	78	:param text_provider: a callable expecting a file_id parameter
	79	that returns the text for that file-id
	80	"""
	81
	82	# Backwards compatibility hack: skip the root id.
	83	if not self.repo.supports_rich_root():
	84	path, root = entries.next()
	85	if root.revision != revision_id:
	86	raise errors.IncompatibleRevision(repr(self.repo))
	87	# Add the texts that are not already present
0.64.27 by Ian Clatworthy 1st cut at performance tuning	88	tx = self.repo.get_transaction()
0.64.5 by Ian Clatworthy first cut at generic processing method	89	for path, ie in entries:
0.64.27 by Ian Clatworthy 1st cut at performance tuning	90	# This test is really slow: over 50% of import time
	91	#w = self.repo.weave_store.get_weave_or_empty(ie.file_id, tx)
	92	#if ie.revision in w:
	93	# continue
0.64.29 by Ian Clatworthy improve explanation of faster check in revisionloader	94	# Try another way, realising that this assumes that the
	95	# version is not already there. In the general case,
	96	# a shared repository might already have the revision but
	97	# we arguably don't need that check when importing from
	98	# a foreign system.
0.64.27 by Ian Clatworthy 1st cut at performance tuning	99	if ie.revision != revision_id:
	100	continue
	101	text_parents = []
	102	for parent_inv in parent_invs:
	103	if ie.file_id not in parent_inv:
	104	continue
	105	parent_id = parent_inv[ie.file_id].revision
	106	if parent_id in text_parents:
	107	continue
	108	text_parents.append(parent_id)
	109	vfile = self.repo.weave_store.get_weave_or_empty(ie.file_id, tx)
	110	lines = text_provider(ie.file_id)
	111	vfile.add_lines(revision_id, text_parents, lines)
0.64.5 by Ian Clatworthy first cut at generic processing method	112
0.64.44 by Ian Clatworthy smart caching of serialised inventories	113	def _add_inventory(self, revision_id, inv, parents):
	114	"""Add the inventory inv to the repository as revision_id.
	115
	116	:param parents: The revision ids of the parents that revision_id
	117	is known to have and are in the repository already.
	118
	119	:returns: The validator(which is a sha1 digest, though what is sha'd is
	120	repository format specific) of the serialized inventory.
	121	"""
	122	return self.repo.add_inventory(revision_id, inv, parents)
	123
0.64.5 by Ian Clatworthy first cut at generic processing method	124	def _default_inventories_provider(self, revision_ids):
	125	"""An inventories provider that queries the repository."""
	126	present = []
	127	inventories = []
	128	for revision_id in revision_ids:
	129	if self.repo.has_revision(revision_id):
	130	present.append(revision_id)
	131	rev_tree = self.repo.revision_tree(revision_id)
	132	else:
	133	rev_tree = self.repo.revision_tree(None)
	134	inventories.append(rev_tree.inventory)
	135	return present, inventories
0.64.44 by Ian Clatworthy smart caching of serialised inventories	136
	137
	138	class ImportRevisionLoader(RevisionLoader):
	139	"""A RevisionLoader optimised for importing.
	140
0.64.48 by Ian Clatworthy one revision loader instance	141	This implementation caches serialised inventory texts.
0.64.44 by Ian Clatworthy smart caching of serialised inventories	142	"""
	143
0.64.49 by Ian Clatworthy skip check re fulltext storage better than delta for inventories when in experimental mode	144	def __init__(self, repo, parent_texts_to_cache=1, random_ids=True):
0.64.48 by Ian Clatworthy one revision loader instance	145	"""See RevisionLoader.__init__.
	146
	147	:param repository: the target repository
	148	:param parent_text_to_cache: the number of parent texts to cache
	149	"""
	150	RevisionLoader.__init__(self, repo)
	151	self.inv_parent_texts = lru_cache.LRUCache(parent_texts_to_cache)
0.64.49 by Ian Clatworthy skip check re fulltext storage better than delta for inventories when in experimental mode	152	self.random_ids = random_ids
0.64.44 by Ian Clatworthy smart caching of serialised inventories	153
	154	def _add_inventory(self, revision_id, inv, parents):
	155	"""See RevisionLoader._add_inventory."""
	156	# Code taken from bzrlib.repository.add_inventory
	157	assert self.repo.is_in_write_group()
	158	_mod_revision.check_not_reserved_id(revision_id)
	159	assert inv.revision_id is None or inv.revision_id == revision_id, \
	160	"Mismatch between inventory revision" \
	161	" id and insertion revid (%r, %r)" % (inv.revision_id, revision_id)
	162	assert inv.root is not None
	163	inv_lines = self.repo._serialise_inventory_to_lines(inv)
	164	inv_vf = self.repo.get_inventory_weave()
	165
0.64.49 by Ian Clatworthy skip check re fulltext storage better than delta for inventories when in experimental mode	166	sha1, num_bytes, parent_text = self._inventory_add_lines(inv_vf,
	167	revision_id, parents, inv_lines, self.inv_parent_texts)
	168	self.inv_parent_texts[revision_id] = parent_text
	169	return sha1
	170
	171	def _inventory_add_lines(self, inv_vf, version_id, parents, lines,
	172	parent_texts):
	173	"""See Repository._inventory_add_lines()."""
0.64.44 by Ian Clatworthy smart caching of serialised inventories	174	final_parents = []
	175	for parent in parents:
	176	if parent in inv_vf:
	177	final_parents.append(parent)
0.64.49 by Ian Clatworthy skip check re fulltext storage better than delta for inventories when in experimental mode	178	return inv_vf.add_lines(version_id, final_parents, lines, parent_texts,
	179	random_id=self.random_ids, check_content=False)
	180
	181
	182	class ExperimentalRevisionLoader(ImportRevisionLoader):
	183	"""A RevisionLoader over optimised for importing.
	184
	185	WARNING: This implementation uses undocumented bzrlib internals.
	186	It may not work in the future. In fact, it may not work now as
0.64.52 by Ian Clatworthy switch on experimental mode by default	187	it is an incubator for experimental code.
0.64.49 by Ian Clatworthy skip check re fulltext storage better than delta for inventories when in experimental mode	188	"""
	189
	190	def __init__(self, repo, parent_texts_to_cache=1, fulltext_every=200):
	191	"""See ImportRevisionLoader.__init__.
	192
	193	:param fulltext_every: how often to store an inventory fulltext
	194	"""
	195	ImportRevisionLoader.__init__(self, repo, parent_texts_to_cache)
	196	self.revision_count = 0
	197	self.fulltext_every = fulltext_every
	198
	199	def _inventory_add_lines(self, inv_vf, version_id, parents, lines,
	200	parent_texts):
	201	"""See Repository._inventory_add_lines()."""
	202	# setup parameters used in original code but not this API
	203	self.revision_count += 1
	204	if self.revision_count % self.fulltext_every == 0:
	205	delta = False
	206	else:
	207	delta = inv_vf.delta
	208	left_matching_blocks = None
	209	random_id = self.random_ids
	210	check_content = False
	211
	212	# bzrlib.knit.add_lines() but error checking optimised
	213	inv_vf._check_add(version_id, lines, random_id, check_content)
	214
	215	####################################################################
	216	# bzrlib.knit._add() but skip checking if fulltext better than delta
	217	####################################################################
	218
	219	line_bytes = ''.join(lines)
	220	digest = osutils.sha_string(line_bytes)
	221	present_parents = []
	222	for parent in parents:
	223	if inv_vf.has_version(parent):
	224	present_parents.append(parent)
	225	if parent_texts is None:
	226	parent_texts = {}
	227
	228	# can only compress against the left most present parent.
	229	if (delta and
	230	(len(present_parents) == 0 or
	231	present_parents[0] != parents[0])):
	232	delta = False
	233
	234	text_length = len(line_bytes)
	235	options = []
	236	if lines:
	237	if lines[-1][-1] != '\n':
	238	# copy the contents of lines.
	239	lines = lines[:]
	240	options.append('no-eol')
	241	lines[-1] = lines[-1] + '\n'
	242	line_bytes += '\n'
	243
	244	#if delta:
	245	# # To speed the extract of texts the delta chain is limited
	246	# # to a fixed number of deltas. This should minimize both
	247	# # I/O and the time spend applying deltas.
	248	# delta = inv_vf._check_should_delta(present_parents)
	249
	250	assert isinstance(version_id, str)
	251	content = inv_vf.factory.make(lines, version_id)
252	if delta or (inv_vf.factory.annotated and len(present_parents) > 0):
253	# Merge annotations from parent texts if needed.
254	delta_hunks = inv_vf._merge_annotations(content, present_parents,
255	parent_texts, delta, inv_vf.factory.annotated,
256	left_matching_blocks)
257
258	if delta:
259	options.append('line-delta')
260	store_lines = inv_vf.factory.lower_line_delta(delta_hunks)
261	size, bytes = inv_vf._data._record_to_data(version_id, digest,
262	store_lines)
263	else:
264	options.append('fulltext')
265	# isinstance is slower and we have no hierarchy.
266	if inv_vf.factory.__class__ == knit.KnitPlainFactory:
267	# Use the already joined bytes saving iteration time in
268	# _record_to_data.
269	size, bytes = inv_vf._data._record_to_data(version_id, digest,
270	lines, [line_bytes])
271	else:
272	# get mixed annotation + content and feed it into the
273	# serialiser.
274	store_lines = inv_vf.factory.lower_fulltext(content)
275	size, bytes = inv_vf._data._record_to_data(version_id, digest,
276	store_lines)
277
278	access_memo = inv_vf._data.add_raw_records([size], bytes)[0]
279	inv_vf._index.add_versions(
280	((version_id, options, access_memo, parents),),
281	random_id=random_id)
282	return digest, text_length, content