annotate toolbox/search.py @ 0:b0942f44413f

import from git://github.com/mozilla/toolbox.git
author Jeff Hammel <k0scist@gmail.com>
date Sun, 11 May 2014 09:15:35 -0700
parents
children cabe97535057
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
1 import os
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
2 import shutil
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
3 import tempfile
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
4
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
5 from time import sleep
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
6 from whoosh import fields
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
7 from whoosh import index
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
8 from whoosh.query import And
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
9 from whoosh.query import Or
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
10 from whoosh.query import Term
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
11 from whoosh.qparser import QueryParser
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
12 from whoosh.index import LockError
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
13
b0942f44413f import from git://github.com/mozilla/toolbox.git
Jeff Hammel <k0scist@gmail.com>
parents:
diff changeset
try:
    # Python 2: whoosh wants ``unicode`` text, and keyword values may be
    # either ``str`` or ``unicode``
    _text_type = unicode
    _string_types = basestring
except NameError:
    # Python 3: ``str`` already is unicode text
    _text_type = str
    _string_types = str


class WhooshSearch(object):
    """full-text search"""

    # how long to keep retrying for the index write lock, and how long
    # to sleep between attempts (both in seconds)
    lock_timeout = 3.
    lock_retry_interval = 0.2

    def __init__(self, whoosh_index=None):
        """
        - whoosh_index : whoosh index directory;
          a temporary directory is created (and removed again when this
          object is collected) if not given
        """
        self.schema = fields.Schema(name=fields.ID(unique=True, stored=True),
                                    description=fields.TEXT)
        self.keywords = set()  # names of the KEYWORD fields added so far
        self.tempdir = False  # True only if we own (and must remove) the directory
        if whoosh_index is None:
            whoosh_index = tempfile.mkdtemp()
            self.tempdir = True
        if not os.path.exists(whoosh_index):
            os.makedirs(whoosh_index)
        self.index = whoosh_index
        self.ix = index.create_in(self.index, self.schema)

    def _writer(self):
        """
        forgivingly get an index writer:
        retry while the index is locked, re-raising the LockError once
        `lock_timeout` seconds worth of retries have been used up
        """
        waited = 0.
        while True:
            try:
                return self.ix.writer()
            except LockError:
                waited += self.lock_retry_interval
                if waited >= self.lock_timeout:
                    # re-raise from inside the handler so that the
                    # LockError itself propagates on python 2 and 3
                    raise
                sleep(self.lock_retry_interval)

    def update(self, name, description, **kw):
        """
        update a document
        - name : unique name of the document
        - description : text of the document
        - kw : keyword fields; each value is either a string or an
          iterable of strings (which is joined with spaces)
        raises LockError if the index stays locked
        """
        writer = self._writer()
        added = []  # keyword fields introduced by this call
        staged = False
        try:
            # add keywords
            document = {}
            for key, value in kw.items():
                if key not in self.keywords:
                    writer.add_field(key, fields.KEYWORD)
                    added.append(key)
                if not isinstance(value, _string_types):
                    value = ' '.join(value)
                # convert to unicode for whoosh
                document[key] = _text_type(value)

            writer.update_document(name=_text_type(name),
                                   description=_text_type(description),
                                   **document)
            staged = True
        finally:
            if not staged:
                # release the write lock instead of leaking it
                writer.cancel()
        writer.commit()
        # only remember the new fields once they are really in the index
        self.keywords.update(added)

    def delete(self, name):
        """
        delete a document of a given name
        raises LockError if the index stays locked
        """
        # convert before taking the lock so a bad name cannot leak it
        name = _text_type(name)
        writer = self._writer()
        staged = False
        try:
            writer.delete_by_term('name', name)
            staged = True
        finally:
            if not staged:
                writer.cancel()
        writer.commit()

    def __call__(self, query):
        """
        search; returns the list of names of the matching documents

        Every whitespace-separated term of the query (the operators
        AND, OR and NOT excepted) must be matched by the parsed query
        itself or occur in the description, the name or one of the
        keyword fields.  ORing the whole query against every keyword
        field proved too strict, ORing every term against every field
        too permissive.
        """
        query = _text_type(query)
        query_parser = QueryParser("description", schema=self.ix.schema)
        myquery = query_parser.parse(query)

        excluded = set(['AND', 'OR', 'NOT'])
        terms = [i for i in query.split() if i not in excluded]
        keyword_fields = list(self.keywords)
        extendedquery = And([Or([myquery,
                                 Term('description', term),
                                 Term('name', term)] +
                                [Term(field, term) for field in keyword_fields])
                             for term in terms])

        # perform the search; the hits are read inside the `with` block
        # because the searcher is closed on leaving it
        with self.ix.searcher() as searcher:
            return [hit['name']
                    for hit in searcher.search(extendedquery, limit=None)]

    def __del__(self):
        # getattr: __init__ may have failed before the attributes were set
        if getattr(self, 'tempdir', False):
            self.tempdir = False  # never try to remove it twice
            directory = getattr(self, 'index', None)
            if directory is not None:
                # delete the temporary directory, if present; errors are
                # ignored since nothing can be done about them here
                shutil.rmtree(directory, ignore_errors=True)