Mercurial > hg > toolbox
comparison toolbox/search.py @ 0:b0942f44413f
import from git://github.com/mozilla/toolbox.git
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 11 May 2014 09:15:35 -0700 |
parents | |
children | cabe97535057 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b0942f44413f |
---|---|
1 import os | |
2 import shutil | |
3 import tempfile | |
4 | |
5 from time import sleep | |
6 from whoosh import fields | |
7 from whoosh import index | |
8 from whoosh.query import And | |
9 from whoosh.query import Or | |
10 from whoosh.query import Term | |
11 from whoosh.qparser import QueryParser | |
12 from whoosh.index import LockError | |
13 | |
class WhooshSearch(object):
    """Full-text search over named documents, backed by a Whoosh index.

    Every document has a unique, stored ``name`` and a full-text
    ``description``.  Extra keyword fields may be attached to a document
    through ``update(**kw)``; they are added to the index schema on demand
    and are consulted by the search as well.
    """

    def __init__(self, whoosh_index=None):
        """
        - whoosh_index : whoosh index directory; if None, a temporary
          directory is created and removed again when this object is
          deleted
        """
        # set first so that __del__ is safe even if the rest fails
        self.tempdir = False

        self.schema = fields.Schema(name=fields.ID(unique=True, stored=True),
                                    description=fields.TEXT)

        # names of the keyword fields added to the schema so far
        self.keywords = set()

        created_tempdir = whoosh_index is None
        if created_tempdir:
            whoosh_index = tempfile.mkdtemp()
        elif not os.path.exists(whoosh_index):
            os.makedirs(whoosh_index)
        self.index = whoosh_index
        self.tempdir = created_tempdir

        # NOTE: per the Whoosh documentation, create_in() on a directory
        # that already holds an index starts over with an empty index
        self.ix = index.create_in(self.index, self.schema)

    @staticmethod
    def _to_text(value):
        """return `value` as a text (unicode) string, as whoosh requires"""
        try:
            text_type = unicode  # Python 2
        except NameError:
            # Python 3: str is the text type; bytes must be decoded since
            # str(b'x') would yield the repr "b'x'"
            if isinstance(value, bytes):
                return value.decode()
            text_type = str
        return text_type(value)

    @staticmethod
    def _is_string(value):
        """return whether `value` is a single string (text or bytes)"""
        try:
            return isinstance(value, basestring)  # Python 2
        except NameError:
            return isinstance(value, (str, bytes))  # Python 3

    def _get_writer(self, timeout=3., incr=0.2):
        """
        forgivingly get an index writer: while the index is locked, retry
        every `incr` seconds until `timeout` seconds have been waited,
        then let the LockError propagate
        """
        waited = 0.
        while True:
            try:
                return self.ix.writer()
            except LockError:
                waited += incr
                if waited >= timeout:
                    # re-raise from inside the handler so that the
                    # original LockError (and traceback) is kept
                    raise
                sleep(incr)

    def update(self, name, description, **kw):
        """
        update (or add) the document called `name`

        - name : unique name of the document
        - description : full-text description of the document
        - kw : additional keyword fields; each value is either a string
          or an iterable of strings, which is joined with spaces

        Raises LockError if the index stays locked for too long.
        """

        # convert to unicode for whoosh
        # really whoosh should do this for us
        # and really python should be unicode-based :(
        # (done before taking the writer so that bad input cannot leave
        # the index locked)
        name = self._to_text(name)
        description = self._to_text(description)
        extra = {}
        for key, value in kw.items():
            if not self._is_string(value):
                value = ' '.join(value)
            extra[key] = self._to_text(value)
        new_keys = [key for key in extra if key not in self.keywords]

        writer = self._get_writer()
        try:
            # add keywords
            for key in new_keys:
                writer.add_field(key, fields.KEYWORD)
            writer.update_document(name=name, description=description,
                                   **extra)
        except BaseException:
            # abandon the pending changes and give up the writer instead
            # of leaving it open
            writer.cancel()
            raise
        writer.commit()

        # only remember the new fields once they really are in the index
        self.keywords.update(new_keys)

    def delete(self, name):
        """
        delete a document of a given name

        Raises LockError if the index stays locked for too long.
        """
        name = self._to_text(name)
        writer = self._get_writer()
        try:
            writer.delete_by_term('name', name)
        except BaseException:
            writer.cancel()
            raise
        writer.commit()

    def __call__(self, query):
        """
        search; returns the list of names of the matching documents

        - query : query string, parsed against the description field
        """
        query = self._to_text(query)
        query_parser = QueryParser("description", schema=self.ix.schema)
        myquery = query_parser.parse(query)

        # History of this query:
        # - matching the whole query string against each keyword field
        #   (OR-ed with the parsed query) was too strict
        # - OR-ing every single term across every keyword field was too
        #   permissive
        # So: every term of the query (boolean operators excluded) must
        # match somewhere -- the parsed query, the description, the name
        # or any of the keyword fields.
        excluded = set(['AND', 'OR', 'NOT'])
        terms = [i for i in query.split() if i not in excluded]
        extendedquery = And([Or([myquery,
                                 Term('description', term),
                                 Term('name', term)] +
                                [Term(field, term) for field in self.keywords])
                             for term in terms])

        # perform the search; the names are read out before the searcher
        # is closed
        searcher = self.ix.searcher()
        try:
            return [hit['name']
                    for hit in searcher.search(extendedquery, limit=None)]
        finally:
            searcher.close()

    def __del__(self):
        # getattr: __init__ may have failed before these were set
        index_dir = getattr(self, 'index', None)
        if getattr(self, 'tempdir', False) and index_dir is not None:
            # delete the temporary directory, if present; errors are
            # ignored since nothing can be done about them in a finalizer
            shutil.rmtree(index_dir, ignore_errors=True)