Mercurial > hg > toolbox
diff toolbox/search.py @ 0:b0942f44413f
import from git://github.com/mozilla/toolbox.git
author | Jeff Hammel <k0scist@gmail.com> |
---|---|
date | Sun, 11 May 2014 09:15:35 -0700 |
parents | |
children | cabe97535057 |
line wrap: on
line diff
# Reconstructed from the diff "--- /dev/null  +++ b/toolbox/search.py"
# (Sun May 11 09:15:35 2014 -0700, hunk @@ -0,0 +1,107 @@).

import os
import shutil
import tempfile

from time import sleep
from whoosh import fields
from whoosh import index
from whoosh.query import And
from whoosh.query import Or
from whoosh.query import Term
from whoosh.qparser import QueryParser
from whoosh.index import LockError

# whoosh wants text (unicode) values; these aliases let the same code run
# on Python 2 (unicode/basestring) and on Python 3 (str)
try:
    _text_type = unicode
    _string_types = basestring
except NameError:
    _text_type = str
    _string_types = str


class WhooshSearch(object):
    """full-text search"""

    # how long to keep retrying when the index is locked by another writer
    lock_timeout = 3.  # seconds
    lock_interval = 0.2  # seconds between attempts

    def __init__(self, whoosh_index=None):
        """
        - whoosh_index : whoosh index directory;
          if None, a temporary directory is created and removed again
          when this object is deleted
        """
        # set first so that __del__ is safe even if the setup below fails
        self.tempdir = False
        self.schema = fields.Schema(name=fields.ID(unique=True, stored=True),
                                    description=fields.TEXT)
        # names of the KEYWORD fields added to the schema so far
        self.keywords = set()
        if whoosh_index is None:
            whoosh_index = tempfile.mkdtemp()
            self.tempdir = True
        if not os.path.exists(whoosh_index):
            os.makedirs(whoosh_index)
        self.index = whoosh_index
        # NOTE: create_in makes a new index in the directory; it does not
        # open an index that may already live there
        self.ix = index.create_in(self.index, self.schema)

    def _writer(self):
        """
        forgivingly get an index writer: while the index is locked, retry
        every `lock_interval` seconds until `lock_timeout` seconds are spent

        raises LockError if the lock could not be obtained in time
        """
        waited = 0.
        while waited < self.lock_timeout:
            try:
                return self.ix.writer()
            except LockError:
                waited += self.lock_interval
                sleep(self.lock_interval)
        raise LockError("could not lock the index within %s seconds"
                        % self.lock_timeout)

    def update(self, name, description, **kw):
        """
        update a document

        - name : unique name of the document
        - description : text of the document
        - kw : additional keyword fields; each value is either a string or
          an iterable of strings (joined with spaces)

        raises LockError if the index stays locked for too long
        """

        # convert to unicode for whoosh
        # really whoosh should do this for us
        # and really python should be unicode-based :(
        # (done before taking the writer so bad input cannot leak the lock)
        name = _text_type(name)
        description = _text_type(description)
        document = {}
        for key, value in kw.items():
            if not isinstance(value, _string_types):
                value = ' '.join(value)
            document[key] = _text_type(value)

        new_keywords = [key for key in document if key not in self.keywords]

        writer = self._writer()
        try:
            # add keywords
            for key in new_keywords:
                writer.add_field(key, fields.KEYWORD)
            writer.update_document(name=name, description=description,
                                   **document)
        except Exception:
            # release the write lock instead of leaving the index locked
            writer.cancel()
            raise
        writer.commit()

        # only remember the fields once they really are part of the index
        self.keywords.update(new_keywords)

    def delete(self, name):
        """
        delete a document of a given name

        raises LockError if the index stays locked for too long
        """
        name = _text_type(name)
        writer = self._writer()
        try:
            writer.delete_by_term('name', name)
        except Exception:
            # release the write lock instead of leaving the index locked
            writer.cancel()
            raise
        writer.commit()

    def __call__(self, query):
        """
        search

        - query : query string

        returns the list of names of the matching documents
        """
        query = _text_type(query)
        query_parser = QueryParser("description", schema=self.ix.schema)
        myquery = query_parser.parse(query)

        # Matching the parsed query or the whole query string as a single
        # Term per keyword field was too strict; OR-ing every term over every
        # field was too permissive.  So: every term of the query (boolean
        # operators aside) must match the parsed query, the description,
        # the name or one of the keyword fields.
        excluded = set(['AND', 'OR', 'NOT'])
        terms = [i for i in query.split() if i not in excluded]
        extendedquery = And([Or([myquery,
                                 Term('description', term),
                                 Term('name', term)] +
                                [Term(field, term)
                                 for field in self.keywords])
                             for term in terms])

        # perform the search; the searcher is closed when the block is left
        with self.ix.searcher() as searcher:
            return [hit['name']
                    for hit in searcher.search(extendedquery, limit=None)]

    def __del__(self):
        # getattr: __init__ may have failed before the attributes were set
        if getattr(self, 'tempdir', False):
            directory = getattr(self, 'index', None)
            if directory is not None:
                # delete the temporary directory, if present;
                # best effort, a destructor must not raise
                shutil.rmtree(directory, ignore_errors=True)