comparison toolbox/search.py @ 0:b0942f44413f

import from git://github.com/mozilla/toolbox.git
author Jeff Hammel <k0scist@gmail.com>
date Sun, 11 May 2014 09:15:35 -0700
parents
children cabe97535057
comparison
equal deleted inserted replaced
-1:000000000000 0:b0942f44413f
1 import os
2 import shutil
3 import tempfile
4
5 from time import sleep
6 from whoosh import fields
7 from whoosh import index
8 from whoosh.query import And
9 from whoosh.query import Or
10 from whoosh.query import Term
11 from whoosh.qparser import QueryParser
12 from whoosh.index import LockError
13
class WhooshSearch(object):
    """full-text search over named documents, backed by a whoosh index

    Each document has a unique ``name``, a free-text ``description`` and any
    number of keyword fields, which are added to the index schema the first
    time they are passed to ``update``.
    """

    # how long to keep retrying for the index write lock, and how long to
    # wait between two attempts (both in seconds)
    lock_timeout = 3.
    lock_interval = 0.2

    def __init__(self, whoosh_index=None):
        """
        - whoosh_index : whoosh index directory; when None, a temporary
          directory is created and removed again when the object is deleted
        """
        self.schema = fields.Schema(name=fields.ID(unique=True, stored=True),
                                    description=fields.TEXT)
        self.keywords = set()  # names of the keyword fields added so far
        self.tempdir = False  # True only if we own (and must remove) the directory
        if whoosh_index is None:
            whoosh_index = tempfile.mkdtemp()
            self.tempdir = True
        if not os.path.exists(whoosh_index):
            os.makedirs(whoosh_index)
        self.index = whoosh_index
        # NOTE(review): create_in is called unconditionally, so this looks
        # like it starts from a fresh index even when the directory already
        # holds one -- confirm that is intended.
        self.ix = index.create_in(self.index, self.schema)

    def _writer(self):
        """
        forgivingly get an index writer: retry while the index is locked

        Raises the last LockError if the lock could not be obtained within
        roughly ``lock_timeout`` seconds.
        """
        # count attempts with an integer; accumulating 0.2 in a float drifts
        attempts = max(1, int(round(self.lock_timeout / self.lock_interval)))
        error = None
        for _ in range(attempts):
            try:
                return self.ix.writer()
            except LockError as exc:
                error = exc
                sleep(self.lock_interval)
        raise error

    def update(self, name, description, **kw):
        """
        add a document, or replace the existing document of the same name

        - name : unique name of the document
        - description : free text to index
        - kw : keyword fields; each value is either a string or an iterable
          of strings (joined with spaces)

        Raises LockError if the index stays locked for too long.
        """
        # convert to unicode for whoosh
        # really whoosh should do this for us
        # and really python should be unicode-based :(
        name = unicode(name)
        description = unicode(description)

        new_keywords = [key for key in kw if key not in self.keywords]

        # as a context manager the writer commits on success and cancels
        # (releasing the write lock) if anything in the body raises
        with self._writer() as writer:
            for key in new_keywords:
                writer.add_field(key, fields.KEYWORD)
            for key in kw:
                if not isinstance(kw[key], basestring):
                    kw[key] = ' '.join(kw[key])
                kw[key] = unicode(kw[key])
            writer.update_document(name=name, description=description, **kw)

        # only remember the new fields once they really are in the index
        self.keywords.update(new_keywords)

    def delete(self, name):
        """
        delete a document of a given name

        Raises LockError if the index stays locked for too long.
        """
        name = unicode(name)
        with self._writer() as writer:
            writer.delete_by_term('name', name)

    def __call__(self, query):
        """
        search; returns the list of names of the matching documents

        - query : query string, parsed against the description field
        """
        query = unicode(query)
        parsed = QueryParser("description", schema=self.ix.schema).parse(query)

        # A document matches if the parsed query matches it, or if every
        # search term is found in its description, its name or one of its
        # keyword fields.  OR-ing the parsed query with a Term on the whole
        # query string per keyword field was too strict, and OR-ing all the
        # per-term matches together was too permissive.
        operators = set(['AND', 'OR', 'NOT'])
        terms = [word for word in query.split() if word not in operators]
        extendedquery = And([
            Or([parsed, Term('description', term), Term('name', term)] +
               [Term(field, term) for field in self.keywords])
            for term in terms])

        # perform the search; the names must be read before the searcher
        # is closed
        with self.ix.searcher() as searcher:
            hits = searcher.search(extendedquery, limit=None)
            return [hit['name'] for hit in hits]

    def __del__(self):
        # getattr: __init__ may have failed before tempdir was set
        if getattr(self, 'tempdir', False):
            # delete the temporary directory, if present; clear the flag
            # first so that a second finalization is a no-op
            self.tempdir = False
            shutil.rmtree(self.index, ignore_errors=True)