/loggerhead/trunk : contents of loggerhead/textindex.py at revision 42

: (revision 42)

To get this branch, use:

bzr branch
http://gegoxaren.bato24.eu/bzr/loggerhead/trunk

#
# Copyright (C) 2006  Robey Pointer <robey@lag.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

"""
indexing of the comment text of revisions, for fast searching.

two separate 'shelve' files are created:

    - recorded: revid -> 1 (if the revid is indexed)
    - index: 3-letter substring -> list(revids)
"""

import logging
import os
import re
import shelve
import threading

from loggerhead import util
from loggerhead.util import decorator

log = logging.getLogger("loggerhead.controllers")

# if any substring index reaches this many revids, replace the entry with
# an ALL marker -- it's not worth an explicit index.
ALL_THRESHOLD = 1000
ALL = 'ALL'


@decorator
def with_lock(unbound):
    def locked(self, *args, **kw):
        self._lock.acquire()
        try:
            return unbound(self, *args, **kw)
        finally:
            self._lock.release()
    return locked


def normalize_string(s):
    """
    remove any punctuation and normalize all whitespace to a single space.
    """
    s = util.to_utf8(s).lower()
    # remove apostrophes completely.
    s = re.sub(r"'", '', s)
    # convert other garbage into space
    s = re.sub(r'[^\w\d]', ' ', s)
    # compress multiple spaces into one.
    s = re.sub(r'\s{2,}', ' ', s)
    # and finally remove leading/trailing whitespace
    s = s.strip()
    return s


class TextIndex (object):
    def __init__(self, history, cache_path):
        self.history = history
        
        if not os.path.exists(cache_path):
            os.mkdir(cache_path)
        
        recorded_filename = os.path.join(cache_path, 'textindex-recorded')
        index_filename = os.path.join(cache_path, 'textindex')
        
        self._recorded = shelve.open(recorded_filename, 'c', protocol=2)
        self._index = shelve.open(index_filename, 'c', protocol=2)
        
        self._lock = threading.RLock()
        
        log.info('Using search index; %d entries.', len(self._recorded))
    
    @with_lock
    def is_indexed(self, revid):
        return self._recorded.get(util.to_utf8(revid), None) is not None
    
    @with_lock
    def __len__(self):
        return len(self._recorded)

    @with_lock
    def flush(self):
        self._recorded.sync()
        self._index.sync()
    
    @with_lock
    def index_change(self, change):
        """
        currently, only indexes the 'comment' field.
        """
        comment = normalize_string(change.comment)
        if len(comment) < 3:
            return
        for i in xrange(len(comment) - 2):
            sub = comment[i:i + 3]
            revid_set = self._index.get(sub, None)
            if revid_set is None:
                revid_set = set()
            elif revid_set == ALL:
                # this entry got too big
                continue
            revid_set.add(change.revid)
            if len(revid_set) > ALL_THRESHOLD:
                revid_set = ALL
            self._index[sub] = revid_set
        
        self._recorded[util.to_utf8(change.revid)] = True
        return
    
    @with_lock
    def find(self, text, revid_list=None):
        text = normalize_string(text)
        if len(text) < 3:
            return []

        total_set = None
        if revid_list is not None:
            total_set = set(revid_list)
        seen_all = False
        
        for i in xrange(len(text) - 2):
            sub = text[i:i + 3]
            revid_set = self._index.get(sub, None)
            if revid_set is None:
                # zero matches, stop here.
                return []
            if revid_set == ALL:
                # skip
                seen_all = True
                continue
            if total_set is None:
                total_set = revid_set
            else:
                total_set.intersection_update(revid_set)
            if len(total_set) == 0:
                return []
        
        # tricky: if seen_all is True, one of the substring indices was ALL
        # (in other words, unindexed), so our results are actually a superset
        # of the exact answer.
        #
        # if we cared, we could do a direct match on the result set and cull
        # out any that aren't actually matches.  for now, i'm gonna say that
        # we DON'T care, and if one of the substrings hit ALL, there's a small
        # chance that we'll give a few false positives, and we don't care.
        return total_set
        

42 by Robey Pointer add text substring indexer	1	#
	2	# Copyright (C) 2006 Robey Pointer <robey@lag.net>
	3	#
	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 2 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU General Public License
	15	# along with this program; if not, write to the Free Software
	16	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17	#
	18
	19	"""
	20	indexing of the comment text of revisions, for fast searching.
	21
	22	two separate 'shelve' files are created:
	23
	24	- recorded: revid -> 1 (if the revid is indexed)
	25	- index: 3-letter substring -> list(revids)
	26	"""
	27
	28	import logging
	29	import os
	30	import re
	31	import shelve
	32	import threading
	33
	34	from loggerhead import util
	35	from loggerhead.util import decorator
	36
	37	log = logging.getLogger("loggerhead.controllers")
	38
	39	# if any substring index reaches this many revids, replace the entry with
	40	# an ALL marker -- it's not worth an explicit index.
	41	ALL_THRESHOLD = 1000
	42	ALL = 'ALL'
	43
	44
	45	@decorator
	46	def with_lock(unbound):
	47	def locked(self, args, *kw):
	48	self._lock.acquire()
	49	try:
	50	return unbound(self, args, *kw)
	51	finally:
	52	self._lock.release()
	53	return locked
	54
	55
	56	def normalize_string(s):
	57	"""
	58	remove any punctuation and normalize all whitespace to a single space.
	59	"""
	60	s = util.to_utf8(s).lower()
	61	# remove apostrophes completely.
	62	s = re.sub(r"'", '', s)
	63	# convert other garbage into space
	64	s = re.sub(r'[^\w\d]', ' ', s)
65	# compress multiple spaces into one.
66	s = re.sub(r'\s{2,}', ' ', s)
67	# and finally remove leading/trailing whitespace
68	s = s.strip()
69	return s
70
71
72	class TextIndex (object):
73	def __init__(self, history, cache_path):
74	self.history = history
75
76	if not os.path.exists(cache_path):
77	os.mkdir(cache_path)
78
79	recorded_filename = os.path.join(cache_path, 'textindex-recorded')
80	index_filename = os.path.join(cache_path, 'textindex')
81
82	self._recorded = shelve.open(recorded_filename, 'c', protocol=2)
83	self._index = shelve.open(index_filename, 'c', protocol=2)
84
85	self._lock = threading.RLock()
86
87	log.info('Using search index; %d entries.', len(self._recorded))
88
89	@with_lock
90	def is_indexed(self, revid):
91	return self._recorded.get(util.to_utf8(revid), None) is not None
92
93	@with_lock
94	def __len__(self):
95	return len(self._recorded)
96
97	@with_lock
98	def flush(self):
99	self._recorded.sync()
100	self._index.sync()
101
102	@with_lock
103	def index_change(self, change):
104	"""
105	currently, only indexes the 'comment' field.
106	"""
107	comment = normalize_string(change.comment)
108	if len(comment) < 3:
109	return
110	for i in xrange(len(comment) - 2):
111	sub = comment[i:i + 3]
112	revid_set = self._index.get(sub, None)
113	if revid_set is None:
114	revid_set = set()
115	elif revid_set == ALL:
116	# this entry got too big
117	continue
118	revid_set.add(change.revid)
119	if len(revid_set) > ALL_THRESHOLD:
120	revid_set = ALL
121	self._index[sub] = revid_set
122
123	self._recorded[util.to_utf8(change.revid)] = True
124	return
125
126	@with_lock
127	def find(self, text, revid_list=None):
128	text = normalize_string(text)
129	if len(text) < 3:
130	return []
131
132	total_set = None
133	if revid_list is not None:
134	total_set = set(revid_list)
135	seen_all = False
136
137	for i in xrange(len(text) - 2):
138	sub = text[i:i + 3]
139	revid_set = self._index.get(sub, None)
140	if revid_set is None:
141	# zero matches, stop here.
142	return []
143	if revid_set == ALL:
144	# skip
145	seen_all = True
146	continue
147	if total_set is None:
148	total_set = revid_set
149	else:
150	total_set.intersection_update(revid_set)
151	if len(total_set) == 0:
152	return []
153
154	# tricky: if seen_all is True, one of the substring indices was ALL
155	# (in other words, unindexed), so our results are actually a superset
156	# of the exact answer.
157	#
158	# if we cared, we could do a direct match on the result set and cull
159	# out any that aren't actually matches. for now, i'm gonna say that
160	# we DON'T care, and if one of the substrings hit ALL, there's a small
161	# chance that we'll give a few false positives, and we don't care.
162	return total_set
163