Mercurial > hg > toolbox
annotate toolbox/search.py @ 0:b0942f44413f
import from git://github.com/mozilla/toolbox.git
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 11 May 2014 09:15:35 -0700 |
parents | |
children | cabe97535057 |
rev | line source |
---|---|
0
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 import os |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 import shutil |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 import tempfile |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 from time import sleep |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 from whoosh import fields |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 from whoosh import index |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 from whoosh.query import And |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 from whoosh.query import Or |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
10 from whoosh.query import Term |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 from whoosh.qparser import QueryParser |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 from whoosh.index import LockError |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 |
b0942f44413f
import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# text-type shim: the original code targets Python 2 (unicode/basestring);
# fall back to str so the class also works on Python 3
try:
    _text = unicode
    _string_types = (basestring,)
except NameError:
    _text = str
    _string_types = (str,)


class WhooshSearch(object):
    """full-text search backed by a whoosh index

    Each document has a unique, stored `name` and a searchable
    `description`.  Extra keyword arguments given to `update` are
    indexed as KEYWORD fields, which are added to the index the first
    time they are seen.
    """

    # how long to wait for the index write lock, and how often to retry
    lock_timeout = 3.  # seconds
    lock_poll = 0.2  # seconds

    def __init__(self, whoosh_index=None):
        """
        - whoosh_index : whoosh index directory; if None, a temporary
          directory is created and removed again in __del__
        """
        # set these first so __del__ is safe even if construction fails
        self.tempdir = False
        self.index = None

        self.schema = fields.Schema(name=fields.ID(unique=True, stored=True),
                                    description=fields.TEXT)
        self.keywords = set()
        if whoosh_index is None:
            whoosh_index = tempfile.mkdtemp()
            self.tempdir = True
        self.index = whoosh_index
        if not os.path.exists(self.index):
            os.makedirs(self.index)
        # NOTE(review): create_in presumably builds a fresh index rather than
        # reusing one already present in the directory -- confirm
        self.ix = index.create_in(self.index, self.schema)

    def _writer(self):
        """
        return an index writer, retrying every `lock_poll` seconds while
        the index is locked; the LockError is re-raised once
        `lock_timeout` seconds have been waited
        """
        waited = 0.
        while True:
            try:
                return self.ix.writer()
            except LockError:
                waited += self.lock_poll
                if waited >= self.lock_timeout:
                    # re-raise from inside the handler: a bare raise
                    # outside of it fails on python 3
                    raise
                sleep(self.lock_poll)

    def update(self, name, description, **kw):
        """
        update a document
        - name : unique name of the document
        - description : text to index for the document
        - kw : extra keyword fields; non-string values are joined with spaces
        """

        # convert to the text type for whoosh before taking the write lock
        name = _text(name)
        description = _text(description)
        document = {}
        for key, value in kw.items():
            if not isinstance(value, _string_types):
                value = ' '.join(value)
            document[key] = _text(value)

        # forgivingly get the writer
        writer = self._writer()
        added = set()
        try:
            # add keyword fields the index does not know about yet
            for key in document:
                if key not in self.keywords:
                    writer.add_field(key, fields.KEYWORD)
                    added.add(key)
            writer.update_document(name=name, description=description,
                                   **document)
        except BaseException:
            # release the write lock; otherwise later writes hit LockError
            writer.cancel()
            raise
        writer.commit()
        # only remember new fields once they are actually committed
        self.keywords.update(added)

    def delete(self, name):
        """delete a document of a given name"""
        writer = self._writer()
        try:
            writer.delete_by_term('name', _text(name))
        except BaseException:
            # release the write lock; otherwise later writes hit LockError
            writer.cancel()
            raise
        writer.commit()

    def __call__(self, query):
        """
        search
        - query : query string
        returns the list of names of the matching documents
        """
        query = _text(query)
        query_parser = QueryParser("description", schema=self.ix.schema)
        myquery = query_parser.parse(query)

        # A document matches when, for every whitespace-separated term
        # (boolean operator words excluded), either the parsed query as a
        # whole matches or the term occurs in the description, the name or
        # one of the keyword fields.
        excluded = set(['AND', 'OR', 'NOT'])
        terms = [term for term in query.split() if term not in excluded]
        extendedquery = And([
            Or([myquery, Term('description', term), Term('name', term)] +
               [Term(field, term) for field in self.keywords])
            for term in terms])

        # perform the search; the searcher is closed when the block exits,
        # so the stored names are read out inside it
        with self.ix.searcher() as searcher:
            return [hit['name']
                    for hit in searcher.search(extendedquery, limit=None)]

    def __del__(self):
        # getattr: the attribute is missing if __init__ never ran
        if getattr(self, 'tempdir', False):
            # delete the temporary directory, if present; clear the flag so
            # a second call (explicit call followed by garbage collection)
            # does nothing
            self.tempdir = False
            shutil.rmtree(self.index, ignore_errors=True)