/loggerhead/trunk

To get this branch, use:
bzr branch http://gegoxaren.bato24.eu/bzr/loggerhead/trunk
42 by Robey Pointer
add text substring indexer
1
#
2
# Copyright (C) 2006  Robey Pointer <robey@lag.net>
3
#
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
#
18
19
"""
20
indexing of the comment text of revisions, for fast searching.
21
22
two separate 'shelve' files are created:
23
24
    - recorded: revid -> 1 (if the revid is indexed)
25
    - index: 3-letter substring -> list(revids)
26
"""
27
28
import logging
29
import os
30
import re
31
import shelve
32
import threading
33
34
from loggerhead import util
35
from loggerhead.util import decorator
36
37
log = logging.getLogger("loggerhead.controllers")
38
39
# if any substring index reaches this many revids, replace the entry with
40
# an ALL marker -- it's not worth an explicit index.
41
ALL_THRESHOLD = 1000
42
ALL = 'ALL'
43
44
45
@decorator
46
def with_lock(unbound):
47
    def locked(self, *args, **kw):
48
        self._lock.acquire()
49
        try:
50
            return unbound(self, *args, **kw)
51
        finally:
52
            self._lock.release()
53
    return locked
54
55
56
def normalize_string(s):
57
    """
58
    remove any punctuation and normalize all whitespace to a single space.
59
    """
60
    s = util.to_utf8(s).lower()
61
    # remove apostrophes completely.
62
    s = re.sub(r"'", '', s)
63
    # convert other garbage into space
64
    s = re.sub(r'[^\w\d]', ' ', s)
65
    # compress multiple spaces into one.
66
    s = re.sub(r'\s{2,}', ' ', s)
67
    # and finally remove leading/trailing whitespace
68
    s = s.strip()
69
    return s
70
71
72
class TextIndex (object):
73
    def __init__(self, history, cache_path):
74
        self.history = history
75
        
76
        if not os.path.exists(cache_path):
77
            os.mkdir(cache_path)
78
        
79
        recorded_filename = os.path.join(cache_path, 'textindex-recorded')
80
        index_filename = os.path.join(cache_path, 'textindex')
81
        
82
        self._recorded = shelve.open(recorded_filename, 'c', protocol=2)
83
        self._index = shelve.open(index_filename, 'c', protocol=2)
84
        
85
        self._lock = threading.RLock()
86
        
87
        log.info('Using search index; %d entries.', len(self._recorded))
88
    
89
    @with_lock
90
    def is_indexed(self, revid):
91
        return self._recorded.get(util.to_utf8(revid), None) is not None
92
    
93
    @with_lock
94
    def __len__(self):
95
        return len(self._recorded)
96
97
    @with_lock
98
    def flush(self):
99
        self._recorded.sync()
100
        self._index.sync()
101
    
102
    @with_lock
103
    def index_change(self, change):
104
        """
105
        currently, only indexes the 'comment' field.
106
        """
107
        comment = normalize_string(change.comment)
108
        if len(comment) < 3:
109
            return
110
        for i in xrange(len(comment) - 2):
111
            sub = comment[i:i + 3]
112
            revid_set = self._index.get(sub, None)
113
            if revid_set is None:
114
                revid_set = set()
115
            elif revid_set == ALL:
116
                # this entry got too big
117
                continue
118
            revid_set.add(change.revid)
119
            if len(revid_set) > ALL_THRESHOLD:
120
                revid_set = ALL
121
            self._index[sub] = revid_set
122
        
123
        self._recorded[util.to_utf8(change.revid)] = True
124
        return
125
    
126
    @with_lock
127
    def find(self, text, revid_list=None):
128
        text = normalize_string(text)
129
        if len(text) < 3:
130
            return []
131
132
        total_set = None
133
        if revid_list is not None:
134
            total_set = set(revid_list)
135
        seen_all = False
136
        
137
        for i in xrange(len(text) - 2):
138
            sub = text[i:i + 3]
139
            revid_set = self._index.get(sub, None)
140
            if revid_set is None:
141
                # zero matches, stop here.
142
                return []
143
            if revid_set == ALL:
144
                # skip
145
                seen_all = True
146
                continue
147
            if total_set is None:
148
                total_set = revid_set
149
            else:
150
                total_set.intersection_update(revid_set)
151
            if len(total_set) == 0:
152
                return []
153
        
154
        # tricky: if seen_all is True, one of the substring indices was ALL
155
        # (in other words, unindexed), so our results are actually a superset
156
        # of the exact answer.
157
        #
158
        # if we cared, we could do a direct match on the result set and cull
159
        # out any that aren't actually matches.  for now, i'm gonna say that
160
        # we DON'T care, and if one of the substrings hit ALL, there's a small
161
        # chance that we'll give a few false positives, and we don't care.
162
        return total_set
163