[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     Manages the snippet file for an indexed object (a book).
109     Snippets are concatenated together in one file; their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the snippet stored
149         there as a unicode string.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
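# A minimal usage sketch for Snippets (illustrative only, not part of the original
# module; assumes settings.SEARCH_INDEX exists and book 123 has been indexed):
#
#     snippets = Snippets(123).open('w')
#     try:
#         pos = snippets.add(u"Some fragment text...")   # -> (position, length)
#     finally:
#         snippets.close()
#     # later, e.g. when rendering search results:
#     text = Snippets(123).open().get(pos)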
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, timeout=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
176         if timeout:
177             conf.setWriteLockTimeout(long(timeout))
178         self.index = IndexWriter(self.store, conf)
179         return self.index
180
181     def optimize(self):
182         self.index.optimize()
183
184     def close(self):
185         try:
186             self.index.optimize()
187         except JavaError, je:
188             print "Error during optimize phase, check index: %s" % je
189
190         self.index.close()
191         self.index = None
192
193     def __enter__(self):
194         self.open()
195         return self
196
197     def __exit__(self, type, value, tb):
198         self.close()
199
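# BaseIndex and its subclasses can be used as context managers; a brief
# illustrative sketch (not part of the original module):
#
#     with Index() as ndx:
#         ndx.index_tags()
#     # __exit__ optimizes and closes the underlying IndexWriter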
200
201 class Index(BaseIndex):
202     """
203     Class indexing books.
204     """
205     def __init__(self, analyzer=None):
206         super(Index, self).__init__(analyzer)
207
208     def index_tags(self):
209         """
210         Re-index global tag list.
211         Removes all tags from the index, then indexes them again.
212         Indexed fields include: id, name (with and without Polish stems), category.
213         """
214         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
215         self.index.deleteDocuments(q)
216
217         for tag in catalogue.models.Tag.objects.all():
218             doc = Document()
219             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
220             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
221             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
222             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
223             self.index.addDocument(doc)
224
225         for pdtag in PDCounterAuthor.objects.all():
226             doc = Document()
227             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
228             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
229             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
230             doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
231             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
232             self.index.addDocument(doc)
233
234         for pdtag in PDCounterBook.objects.all():
235             doc = Document()
236             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
237             doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
238             doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
239             doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
240             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
241             self.index.addDocument(doc)
242
243     def create_book_doc(self, book):
244         """
245         Create a Lucene document referring to the book id.
246         """
247         doc = Document()
248         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
249         if book.parent is not None:
250             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
251         return doc
252
253     def remove_book(self, book):
254         """Removes a book from search index.
255         book - Book instance."""
256         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
257         self.index.deleteDocuments(q)
258
259     def index_book(self, book, book_info=None, overwrite=True):
260         """
261         Indexes the book.
262         Creates a lucene document for extracted metadata
263         and calls self.index_content() to index the contents of the book.
264         """
265         if overwrite:
266             self.remove_book(book)
267
268         book_doc = self.create_book_doc(book)
269         meta_fields = self.extract_metadata(book, book_info)
270         for f in meta_fields.values():
271             if isinstance(f, list) or isinstance(f, tuple):
272                 for elem in f:
273                     book_doc.add(elem)
274             else:
275                 book_doc.add(f)
276         self.index.addDocument(book_doc)
277         del book_doc
278
279         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
280
281     master_tags = [
282         'opowiadanie',
283         'powiesc',
284         'dramat_wierszowany_l',
285         'dramat_wierszowany_lp',
286         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
287         'wywiad',
288         ]
289
290     ignore_content_tags = [
291         'uwaga', 'extra',
292         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
293         'didaskalia',
294         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
295         ]
296
297     footnote_tags = ['pa', 'pt', 'pr', 'pe']
298
299     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
300
301     published_date_re = re.compile("([0-9]+)[\]. ]*$")
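    # Illustrative example (hypothetical source_name value):
    # published_date_re.search(u"Czytelnik, Warszawa 1990.").group(1) == u"1990"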
302
303     def extract_metadata(self, book, book_info=None):
304         """
305         Extracts metadata from the book and returns a map of fields keyed by field name.
306         """
307         fields = {}
308
309         if book_info is None:
310             book_info = dcparser.parse(open(book.xml_file.path))
311
312         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
313         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
314         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
315
316         # validator, name
317         for field in dcparser.BookInfo.FIELDS:
318             if hasattr(book_info, field.name):
319                 if not getattr(book_info, field.name):
320                     continue
321                 # since no type information is available, we use validator
322                 type_indicator = field.validator
323                 if type_indicator == dcparser.as_unicode:
324                     s = getattr(book_info, field.name)
325                     if field.multiple:
326                         s = ', '.join(s)
327                     try:
328                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
329                     except JavaError as je:
330                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
331                 elif type_indicator == dcparser.as_person:
332                     p = getattr(book_info, field.name)
333                     if isinstance(p, dcparser.Person):
334                         persons = unicode(p)
335                     else:
336                         persons = ', '.join(map(unicode, p))
337                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
338                 elif type_indicator == dcparser.as_date:
339                     dt = getattr(book_info, field.name)
340                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
341                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
342
343         # get published date
344         pd = None
345         if hasattr(book_info, 'source_name') and book_info.source_name:
346             match = self.published_date_re.search(book_info.source_name)
347             if match is not None:
348                 pd = str(match.groups()[0])
349         if not pd: pd = ""
350         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
351
352         return fields
353
354     def add_gaps(self, fields, fieldname):
355         """
356         Interposes gap-fields (indexed spaces) between the given fields and returns the result.
357         This prevents phrase queries (with slop 0) from matching across the gaps.
358         """
359         def gap():
360             while True:
361                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
362         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
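        # e.g. (illustrative) add_gaps([f1, f2, f3], 'tags') -> (f1, gap, f2, gap, f3),
        # so a phrase query with slop 0 cannot match across two adjacent tag values.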
363
364     def get_master(self, root):
365         """
366         Returns the first master tag from an etree.
367         """
368         for master in root.iter():
369             if master.tag in self.master_tags:
370                 return master
371
372     def index_content(self, book, book_fields=[]):
373         """
374         Walks the book XML and extracts content from it.
375         Adds parts for each header tag and for each fragment.
376         """
377         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
378         root = wld.edoc.getroot()
379
380         master = self.get_master(root)
381         if master is None:
382             return []
383
384         def walker(node, ignore_tags=[]):
385
386             if node.tag not in ignore_tags:
387                 yield node, None, None
388                 if node.text is not None:
389                     yield None, node.text, None
390                 for child in list(node):
391                     for b, t, e in walker(child):
392                         yield b, t, e
393                 yield None, None, node
394
395             if node.tail is not None:
396                 yield None, node.tail, None
397             return
398
399         def fix_format(text):
400             #            separator = [u" ", u"\t", u".", u";", u","]
401             if isinstance(text, list):
402                 # need to join it first
403                 text = filter(lambda s: s is not None, text)
404                 text = u' '.join(text)
405                 # for i in range(len(text)):
406                 #     if i > 0:
407                 #         if text[i][0] not in separator\
408                 #             and text[i - 1][-1] not in separator:
409                 #          text.insert(i, u" ")
410
411             return re.sub("(?m)/$", "", text)
412
413         def add_part(snippets, **fields):
414             doc = self.create_book_doc(book)
415             for f in book_fields:
416                 doc.add(f)
417
418             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
419             doc.add(NumericField("header_span", Field.Store.YES, True)\
420                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
421             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
422
423             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
424                           Field.TermVector.WITH_POSITIONS_OFFSETS))
425
426             snip_pos = snippets.add(fields["content"])
427             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
428             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
429
430             if 'fragment_anchor' in fields:
431                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
432                               Field.Store.YES, Field.Index.NOT_ANALYZED))
433
434             if 'themes' in fields:
435                 themes, themes_pl = zip(*[
436                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
437                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
438                      for theme in fields['themes']])
439
440                 themes = self.add_gaps(themes, 'themes')
441                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
442
443                 for t in themes:
444                     doc.add(t)
445                 for t in themes_pl:
446                     doc.add(t)
447
448             return doc
449
450         def give_me_utf8(s):
451             if isinstance(s, unicode):
452                 return s.encode('utf-8')
453             else:
454                 return s
455
456         fragments = {}
457         snippets = Snippets(book.id).open('w')
458         try:
459             for header, position in zip(list(master), range(len(master))):
460
461                 if header.tag in self.skip_header_tags:
462                     continue
463                 if header.tag is etree.Comment:
464                     continue
465
466                 # section content
467                 content = []
468                 footnote = []
469
470                 def all_content(text):
471                     for frag in fragments.values():
472                         frag['content'].append(text)
473                     content.append(text)
474                 handle_text = [all_content]
475
476
477                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
478                     # handle footnotes
479                     if start is not None and start.tag in self.footnote_tags:
480                         footnote = []
481                         def collect_footnote(t):
482                             footnote.append(t)
483                         handle_text.append(collect_footnote)
484                     elif end is not None and footnote and end.tag in self.footnote_tags:
485                         handle_text.pop()
486                         doc = add_part(snippets, header_index=position, header_type=header.tag,
487                                        content=u''.join(footnote),
488                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
489                 
490                         self.index.addDocument(doc)
491                         #print "@ footnote text: %s" % footnote
492                         footnote = []
493                     
494                     # handle fragments and themes.
495                     if start is not None and start.tag == 'begin':
496                         fid = start.attrib['id'][1:]
497                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
498
499                     # themes for this fragment
500                     elif start is not None and start.tag == 'motyw':
501                         fid = start.attrib['id'][1:]
502                         handle_text.append(None)
503                         if start.text is not None:
504                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
505                     elif end is not None and end.tag == 'motyw':
506                         handle_text.pop()
507
508                     elif start is not None and start.tag == 'end':
509                         fid = start.attrib['id'][1:]
510                         if fid not in fragments:
511                             continue  # a broken <end> node, skip it
512                         frag = fragments[fid]
513                         if frag['themes'] == []:
514                             continue  # empty themes list.
515                         del fragments[fid]
516
517                         doc = add_part(snippets,
518                                        header_type=frag['start_header'],
519                                        header_index=frag['start_section'],
520                                        header_span=position - frag['start_section'] + 1,
521                                        fragment_anchor=fid,
522                                        content=fix_format(frag['content']),
523                                        themes=frag['themes'])
524                         #print '@ FRAG %s' % frag['content']
525                         self.index.addDocument(doc)
526
527                     # Collect content.
528
529                     if text is not None and handle_text:
530                         hdl = handle_text[-1]
531                         if hdl is not None:
532                             hdl(text)
533
534                 # in the end, add a section text.
535                 doc = add_part(snippets, header_index=position, header_type=header.tag,
536                                content=fix_format(content))
537                 #print '@ CONTENT: %s' % fix_format(content)
538
539                 self.index.addDocument(doc)
540
541         finally:
542             snippets.close()
543
544
545 def log_exception_wrapper(f):
546     def _wrap(*a):
547         try:
548             f(*a)
549         except Exception, e:
550             print("Error in indexing thread: %s" % e)
551             traceback.print_exc()
552             raise e
553     return _wrap
554
555
556 class ReusableIndex(Index):
557     """
558     Works like Index, but does not close/optimize the Lucene index
559     until program exit (uses an atexit hook).
560     This is useful for the importbooks command.
561
562     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
563     """
564     index = None
565
566     def open(self, analyzer=None, **kw):
567         if ReusableIndex.index:
568             self.index = ReusableIndex.index
569         else:
570             print("opening index")
571             Index.open(self, analyzer, **kw)
572             ReusableIndex.index = self.index
573             atexit.register(ReusableIndex.close_reusable)
574
575     # def index_book(self, *args, **kw):
576     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
577     #     ReusableIndex.pool_jobs.append(job)
578
579     @staticmethod
580     def close_reusable():
581         if ReusableIndex.index:
582             print("closing index")
583             ReusableIndex.index.optimize()
584             ReusableIndex.index.close()
585             ReusableIndex.index = None
586
587     def close(self):
588         if ReusableIndex.index:
589             ReusableIndex.index.commit()
590
591
592 class JoinSearch(object):
593     """
594     This mixin could be used to handle block join queries.
595     (currently unused)
596     """
597     def __init__(self, *args, **kw):
598         super(JoinSearch, self).__init__(*args, **kw)
599
600     def wrapjoins(self, query, fields=[]):
601         """
602         This function modifies the query recursively, so that
603         contained Term and Phrase queries which match the
604         provided fields are wrapped in a BlockJoinQuery,
605         and thus delegated to child documents.
606         """
607         if BooleanQuery.instance_(query):
608             qs = BooleanQuery.cast_(query)
609             for clause in qs:
610                 clause = BooleanClause.cast_(clause)
611                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
612             return qs
613         else:
614             termset = HashSet()
615             query.extractTerms(termset)
616             for t in termset:
617                 t = Term.cast_(t)
618                 if t.field() not in fields:
619                     return query
620             return BlockJoinQuery(query, self.parent_filter,
621                                   BlockJoinQuery.ScoreMode.Total)
622
623     def bsearch(self, query, max_results=50):
624         q = self.query(query)
625         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
626
627         tops = self.searcher.search(bjq, max_results)
628         bks = []
629         for found in tops.scoreDocs:
630             doc = self.searcher.doc(found.doc)
631             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
632         return (bks, tops.totalHits)
633
634
635 class SearchResult(object):
636     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
637         if tokens_cache is None: tokens_cache = {}
638
639         if score:
640             self._score = score
641         else:
642             self._score = scoreDocs.score
643
644         self.boost = 1.0
645
646         self._hits = []
647         self._processed_hits = None  # processed hits
648
649         stored = search.searcher.doc(scoreDocs.doc)
650         self.book_id = int(stored.get("book_id"))
651
652         pd = stored.get("published_date")
653         if pd is None:
654             pd = 0
655         self.published_date = int(pd)
656
657         header_type = stored.get("header_type")
658         # we have a content hit in some header or fragment
659         if header_type is not None:
660             sec = (header_type, int(stored.get("header_index")))
661             header_span = stored.get('header_span')
662             header_span = header_span is not None and int(header_span) or 1
663
664             fragment = stored.get("fragment_anchor")
665
666             if snippets:
667                 snippets = snippets.replace("/\n", "\n")
668             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
669
670             self._hits.append(hit)
671
672         self.search = search
673         self.searched = searched
674         self.tokens_cache = tokens_cache
675
676     @property
677     def score(self):
678         return self._score * self.boost
679
680     def merge(self, other):
681         if self.book_id != other.book_id:
682             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
683         self._hits += other._hits
684         if other.score > self.score:
685             self._score = other._score
686         return self
687
688     def get_book(self):
689         return catalogue.models.Book.objects.get(id=self.book_id)
690
691     book = property(get_book)
692
693     @property
694     def hits(self):
695         if self._processed_hits is not None:
696             return self._processed_hits
697
698         POSITION = 0
699         FRAGMENT = 1
700         POSITION_INDEX = 1
701         POSITION_SPAN = 2
702         SCORE = 2
703         OTHER = 3
704
705         # to sections and fragments
706         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
707         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
708         sect = filter(lambda s: 0 == len(filter(
709             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
710             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
711             frags)), sect)
712
713         hits = []
714
715         # remove duplicate fragments
716         fragments = {}
717         for f in frags:
718             fid = f[FRAGMENT]
719             if fid in fragments:
720                 if fragments[fid][SCORE] >= f[SCORE]:
721                     continue
722             fragments[fid] = f
723         frags = fragments.values()
724
725         # remove duplicate sections
726         sections = {}
727
728         for s in sect:
729             si = s[POSITION][POSITION_INDEX]
730             # skip existing
731             if si in sections:
732                 if sections[si]['score'] >= s[SCORE]:
733                     continue
734
735             m = {'score': s[SCORE],
736                  'section_number': s[POSITION][POSITION_INDEX] + 1,
737                  }
738             m.update(s[OTHER])
739             sections[si] = m
740
741         hits = sections.values()
742
743         for f in frags:
744             try:
745                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
746             except catalogue.models.Fragment.DoesNotExist:
747                 # stale index
748                 continue
749
750             # Figure out if we were searching for a token matching some word in theme name.
751             themes = frag.tags.filter(category='theme')
752             themes_hit = []
753             if self.searched is not None:
754                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
755                 for theme in themes:
756                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
757                     for t in tokens:
758                         if t in name_tokens:
759                             if theme not in themes_hit:
760                                 themes_hit.append(theme)
761                             break
762
763             m = {'score': f[SCORE],
764                  'fragment': frag,
765                  'section_number': f[POSITION][POSITION_INDEX] + 1,
766                  'themes': themes,
767                  'themes_hit': themes_hit
768                  }
769             m.update(f[OTHER])
770             hits.append(m)
771
772         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
773
774         self._processed_hits = hits
775
776         return hits
777
778     def __unicode__(self):
779         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
780
781     @staticmethod
782     def aggregate(*result_lists):
783         books = {}
784         for rl in result_lists:
785             for r in rl:
786                 if r.book_id in books:
787                     books[r.book_id].merge(r)
788                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
789                 else:
790                     books[r.book_id] = r
791         return books.values()
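        # e.g. (illustrative) aggregate(perfect_book_results, everywhere_results) returns
        # one SearchResult per book, with hits from both input lists merged via merge().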
792
793     def __cmp__(self, other):
794         c = cmp(self.score, other.score)
795         if c == 0:
796             # this is inverted, because earlier date is better
797             return cmp(other.published_date, self.published_date)
798         else:
799             return c
800
801
802 class Hint(object):
803     """
804     Given some hint information (things we already know about
805     the search target - like author, title (specific book), epoch, genre, kind),
806     we can narrow down the search using filters.
807     """
808     def __init__(self, search):
809         """
810         Accepts a Search instance.
811         """
812         self.search = search
813         self.book_tags = {}
814         self.part_tags = []
815         self._books = []
816
817     def books(self, *books):
818         """
819         Give a hint that we search these books.
820         """
821         self._books = books
822
823     def tags(self, tags):
824         """
825         Give a hint that these Tag objects (a list of them)
826         are necessary.
827         """
828         for t in tags:
829             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
830                 lst = self.book_tags.get(t.category, [])
831                 lst.append(t)
832                 self.book_tags[t.category] = lst
833             if t.category in ['theme', 'theme_pl']:
834                 self.part_tags.append(t)
835
836     def tag_filter(self, tags, field='tags'):
837         """
838         Given a list of tags and an optional field (they are normally in the 'tags' field),
839         returns a filter accepting only books with those specific tags.
840         """
841         q = BooleanQuery()
842
843         for tag in tags:
844             toks = self.search.get_tokens(tag.name, field=field)
845             tag_phrase = PhraseQuery()
846             for tok in toks:
847                 tag_phrase.add(Term(field, tok))
848             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
849
850         return QueryWrapperFilter(q)
851
852     def book_filter(self):
853         """
854         Filters using book tags (all tag categories except themes).
855         """
856         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
857         if tags:
858             return self.tag_filter(tags)
859         else:
860             return None
861
862     def part_filter(self):
863         """
864         This filter can be used to look for book parts.
865         It filters on book id and/or themes.
866         """
867         fs = []
868         if self.part_tags:
869             fs.append(self.tag_filter(self.part_tags, field='themes'))
870
871         if self._books != []:
872             bf = BooleanFilter()
873             for b in self._books:
874                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
875                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
876             fs.append(bf)
877
878         return Search.chain_filters(fs)
879
880     def should_search_for_book(self):
881         return self._books == []
882
883     def just_search_in(self, all):
884         """Holds the logic to figure out which indexes should be searched, when we already have some hints"""
885         some = []
886         for field in all:
887             if field == 'authors' and 'author' in self.book_tags:
888                 continue
889             if field == 'title' and self._books != []:
890                 continue
891             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
892                 continue
893             some.append(field)
894         return some
895
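# A rough sketch of how Hint is meant to be used (illustrative only; the tag
# queryset below is hypothetical):
#
#     search = Search()
#     hint = search.hint()
#     hint.tags(catalogue.models.Tag.objects.filter(category='author', slug='some-author'))
#     results = search.search_perfect_book(u"some title", hint=hint)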
896
897 class Search(IndexStore):
898     """
899     Search facilities.
900     """
901     def __init__(self, default_field="content"):
902         IndexStore.__init__(self)
903         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
904         # self.analyzer = WLAnalyzer()
905         self.searcher = IndexSearcher(self.store, True)
906         self.parser = QueryParser(Version.LUCENE_34, default_field,
907                                   self.analyzer)
908
909         self.parent_filter = TermsFilter()
910         self.parent_filter.addTerm(Term("is_book", "true"))
911
912     def query(self, query):
913         """Parse query in default Lucene Syntax. (for humans)
914         """
915         return self.parser.parse(query)
916
917     def simple_search(self, query, max_results=50):
918         """Runs a query for books using lucene syntax. (for humans)
919         Returns (books, total_hits)
920         """
921
922         tops = self.searcher.search(self.query(query), max_results)
923         bks = []
924         for found in tops.scoreDocs:
925             doc = self.searcher.doc(found.doc)
926             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
927         return (bks, tops.totalHits)
928
929     def get_tokens(self, searched, field='content', cached=None):
930         """Returns tokens analyzed by the proper (per-field) analyzer.
931         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
932         they are just returned (so we can reuse tokens, as long as we don't change the analyzer).
933         """
934         if cached is not None and field in cached:
935             return cached[field]
936
937         if isinstance(searched, str) or isinstance(searched, unicode):
938             searched = StringReader(searched)
939         elif isinstance(searched, list):
940             return searched
941
942         searched.reset()
943         tokens = self.analyzer.reusableTokenStream(field, searched)
944         toks = []
945         while tokens.incrementToken():
946             cta = tokens.getAttribute(CharTermAttribute.class_)
947             toks.append(cta.toString())
948
949         if cached is not None:
950             cached[field] = toks
951
952         return toks
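        # e.g. (illustrative) get_tokens(u"Ala ma kota", 'SIMPLE') -> [u'ala', u'ma', u'kota'],
        # since the SIMPLE field uses SimpleAnalyzer (letter tokenization + lowercasing).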
953
954     def fuzziness(self, fuzzy):
955         """Helper method to sanitize fuzziness"""
956         if not fuzzy:
957             return None
958         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
959             return fuzzy
960         else:
961             return 0.5
962
963     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
964         """
965         Return a PhraseQuery with a series of tokens.
966         """
967         if fuzzy:
968             phrase = MultiPhraseQuery()
969             for t in tokens:
970                 term = Term(field, t)
971                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
972                 fuzzterms = []
973
974                 while True:
975                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
976                     ft = fuzzterm.term()
977                     if ft:
978                         fuzzterms.append(ft)
979                     if not fuzzterm.next(): break
980                 if fuzzterms:
981                     phrase.add(JArray('object')(fuzzterms, Term))
982                 else:
983                     phrase.add(term)
984         else:
985             phrase = PhraseQuery()
986             phrase.setSlop(slop)
987             for t in tokens:
988                 term = Term(field, t)
989                 phrase.add(term)
990         return phrase
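        # e.g. (illustrative) make_phrase([u'pan', u'tadeusz'], field='title') builds a PhraseQuery
        # roughly equivalent to title:"pan tadeusz"~2 (slop defaults to 2).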
991
992     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
993         """
994         Returns term queries joined by a boolean query.
995         modal - the BooleanClause.Occur value applied to each term clause
996         fuzzy - should the query be fuzzy.
997         """
998         q = BooleanQuery()
999         for t in tokens:
1000             term = Term(field, t)
1001             if fuzzy:
1002                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1003             else:
1004                 term = TermQuery(term)
1005             q.add(BooleanClause(term, modal))
1006         return q
1007
1008     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1009                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1010         if filters is None: filters = []
1011         if tokens_cache is None: tokens_cache = {}
1012
1013         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1014
1015         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1016         if book:
1017             filters.append(self.term_filter(Term('is_book', 'true')))
1018         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1019
1020         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1021
1022     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1023                     filters=None, tokens_cache=None, boost=None, snippets=True):
1024         if filters is None: filters = []
1025         if tokens_cache is None: tokens_cache = {}
1026
1027         if book:
1028             filters.append(self.term_filter(Term('is_book', 'true')))
1029
1030         query = BooleanQuery()
1031
1032         for fld in fields:
1033             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1034
1035             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1036                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1037
1038         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1039
1040         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1041                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1042
1043     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1044         """
1045         Search for perfect book matches. Just see if the query matches with some author or title,
1046         taking hints into account.
1047         """
1048         fields_to_search = ['authors', 'title']
1049         only_in = None
1050         if hint:
1051             if not hint.should_search_for_book():
1052                 return []
1053             fields_to_search = hint.just_search_in(fields_to_search)
1054             only_in = hint.book_filter()
1055
1056         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1057
1058         books = []
1059         for q in qrys:
1060             top = self.searcher.search(q,
1061                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1062                 max_results)
1063             for found in top.scoreDocs:
1064                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1065         return books
1066
1067     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1068         fields_to_search = ['tags', 'authors', 'title']
1069
1070         only_in = None
1071         if hint:
1072             if not hint.should_search_for_book():
1073                 return []
1074             fields_to_search = hint.just_search_in(fields_to_search)
1075             only_in = hint.book_filter()
1076
1077         tokens = self.get_tokens(searched, field='SIMPLE')
1078
1079         q = BooleanQuery()
1080
1081         for fld in fields_to_search:
1082             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1083                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1084
1085         books = []
1086         top = self.searcher.search(q,
1087                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1088             max_results)
1089         for found in top.scoreDocs:
1090             books.append(SearchResult(self, found, how_found="search_book"))
1091
1092         return books
1093
1094     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1095         """
1096         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1097         some part/fragment of the book.
1098         """
1099         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1100
1101         flt = None
1102         if hint:
1103             flt = hint.part_filter()
1104
1105         books = []
1106         for q in qrys:
1107             top = self.searcher.search(q,
1108                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1109                                                            flt]),
1110                                        max_results)
1111             for found in top.scoreDocs:
1112                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1113
1114         return books
1115
1116     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1117         """
1118         Tries to use search terms to match different fields of the book (or its parts).
1119         E.g. one word can match an author's name, another can be part of the title, and the rest
1120         some words from the third chapter.
1121         """
1122         if tokens_cache is None: tokens_cache = {}
1123         books = []
1124         only_in = None
1125
1126         if hint:
1127             only_in = hint.part_filter()
1128
1129         # content only query : themes x content
1130         q = BooleanQuery()
1131
1132         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1133         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1134
1135         # only search in themes when we do not already filter by themes
1136         if hint is None or hint.just_search_in(['themes']) != []:
1137             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1138                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1139
1140         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1141                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1142
1143         topDocs = self.searcher.search(q, only_in, max_results)
1144         for found in topDocs.scoreDocs:
1145             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1146             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1147
1148         # query themes/content x author/title/tags
1149         q = BooleanQuery()
1150         in_content = BooleanQuery()
1151         in_meta = BooleanQuery()
1152
1153         for fld in ['themes_pl', 'content']:
1154             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1155
1156         for fld in ['tags', 'authors', 'title']:
1157             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1158
1159         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1160         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1161
1162         topDocs = self.searcher.search(q, only_in, max_results)
1163         for found in topDocs.scoreDocs:
1164             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1165             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1166
1167         return books
1168
1169     # def multisearch(self, query, max_results=50):
1170     #     """
1171     #     Search strategy:
1172     #     - (phrase) OR -> content
1173     #                   -> title
1174     #                   -> authors
1175     #     - (keywords)  -> authors
1176     #                   -> motyw
1177     #                   -> tags
1178     #                   -> content
1179     #     """
1180         # queryreader = StringReader(query)
1181         # tokens = self.get_tokens(queryreader)
1182
1183         # top_level = BooleanQuery()
1184         # Should = BooleanClause.Occur.SHOULD
1185
1186         # phrase_level = BooleanQuery()
1187         # phrase_level.setBoost(1.3)
1188
1189         # p_content = self.make_phrase(tokens, joined=True)
1190         # p_title = self.make_phrase(tokens, 'title')
1191         # p_author = self.make_phrase(tokens, 'author')
1192
1193         # phrase_level.add(BooleanClause(p_content, Should))
1194         # phrase_level.add(BooleanClause(p_title, Should))
1195         # phrase_level.add(BooleanClause(p_author, Should))
1196
1197         # kw_level = BooleanQuery()
1198
1199         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1200         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1201         # kw_level.add(j_themes, Should)
1202         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1203         # j_con = self.make_term_query(tokens, joined=True)
1204         # kw_level.add(j_con, Should)
1205
1206         # top_level.add(BooleanClause(phrase_level, Should))
1207         # top_level.add(BooleanClause(kw_level, Should))
1208
1209         # return None
1210
1211     def get_snippets(self, scoreDoc, query, field='content'):
1212         """
1213         Returns a snippet for found scoreDoc.
1214         """
1215         htmlFormatter = SimpleHTMLFormatter()
1216         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1217
1218         stored = self.searcher.doc(scoreDoc.doc)
1219
1220         position = stored.get('snippets_position')
1221         length = stored.get('snippets_length')
1222         if position is None or length is None:
1223             return None
1224         # locate content.
1225         book_id = int(stored.get('book_id'))
1226         snippets = Snippets(book_id).open()
1227         try:
1228             try:
1229                 text = snippets.get((int(position),
1230                                      int(length)))
1231             finally:
1232                 snippets.close()
1233
1234             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1235             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1236             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1237
1238         except Exception, e:
1239             e2 = e
1240             if hasattr(e, 'getJavaException'):
1241                 e2 = unicode(e.getJavaException())
1242             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1243                 e2)
1244         return snip
1245
1246     @staticmethod
1247     def enum_to_array(enum):
1248         """
1249         Converts a lucene TermEnum to array of Terms, suitable for
1250         addition to queries
1251         """
1252         terms = []
1253
1254         while True:
1255             t = enum.term()
1256             if t:
1257                 terms.append(t)
1258             if not enum.next(): break
1259
1260         if terms:
1261             return JArray('object')(terms, Term)
1262
1263     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1264         """
1265         Search for Tag objects using query.
1266         """
1267         if not pdcounter:
1268             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1269         tops = self.searcher.search(query, filters, max_results)
1270
1271         tags = []
1272         for found in tops.scoreDocs:
1273             doc = self.searcher.doc(found.doc)
1274             is_pdcounter = doc.get('is_pdcounter')
1275             category = doc.get('tag_category')
1276             if is_pdcounter == 'true':
1277                 if category == 'pd_author':
1278                     tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1279                 elif category == 'pd_book':
1280                     tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1281                     tag.category = 'pd_book'  # make it look more like a tag.
1282                 else:
1283                     print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
                         continue
1284             else:
1285                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1286                 # don't add the pdcounter tag if same tag already exists
1287             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1288                 tags.append(tag)
1289                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1290         print 'returning %s' % tags
1291         return tags
1292
1293     def search_books(self, query, filter=None, max_results=10):
1294         """
1295         Searches for Book objects using query
1296         """
1297         bks = []
1298         tops = self.searcher.search(query, filter, max_results)
1299         for found in tops.scoreDocs:
1300             doc = self.searcher.doc(found.doc)
1301             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1302         return bks
1303
1304     def make_prefix_phrase(self, toks, field):
1305         q = MultiPhraseQuery()
1306         for i in range(len(toks)):
1307             t = Term(field, toks[i])
1308             if i == len(toks) - 1:
1309                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1310                 if pterms:
1311                     q.add(pterms)
1312                 else:
1313                     q.add(t)
1314             else:
1315                 q.add(t)
1316         return q
1317
1318     @staticmethod
1319     def term_filter(term, inverse=False):
1320         only_term = TermsFilter()
1321         only_term.addTerm(term)
1322
1323         if inverse:
1324             neg = BooleanFilter()
1325             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1326             only_term = neg
1327
1328         return only_term
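        # e.g. (illustrative) term_filter(Term('is_book', 'true')) keeps only book-level documents;
        # with inverse=True it keeps only non-book (part/fragment) documents instead.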
1329
1330     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1331         """
1332         Return auto-complete hints for tags
1333         using prefix search.
1334         """
1335         toks = self.get_tokens(string, field='SIMPLE')
1336         top = BooleanQuery()
1337
1338         for field in ['tag_name', 'tag_name_pl']:
1339             if prefix:
1340                 q = self.make_prefix_phrase(toks, field)
1341             else:
1342                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1343             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1344
1345         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1346
1347         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1348
1349     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1350         """
1351         Returns auto-complete hints for book titles,
1352         because we do not index 'pseudo' title-tags.
1353         Uses prefix search.
1354         """
1355         toks = self.get_tokens(string, field='SIMPLE')
1356
1357         if prefix:
1358             q = self.make_prefix_phrase(toks, 'title')
1359         else:
1360             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1361
1362         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1363
1364     @staticmethod
1365     def chain_filters(filters, op=ChainedFilter.AND):
1366         """
1367         Chains a filter list together
1368         """
1369         filters = filter(lambda x: x is not None, filters)
1370         if not filters:
1371             return None
1372         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1373         return chf
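        # e.g. (illustrative) chain_filters([None, only_books]) -> ChainedFilter over [only_books];
        # chain_filters([None, None]) -> None (no filtering applied).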
1374
1375     def filtered_categories(self, tags):
1376         """
1377         Return a list of tag categories present in the tags list.
1378         """
1379         cats = {}
1380         for t in tags:
1381             cats[t.category] = True
1382         return cats.keys()
1383
1384     def hint(self):
1385         return Hint(self)