workaround for ie ignoring everything
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray, JavaError
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
class WLAnalyzer(PerFieldAnalyzerWrapper):
    """
    Analyzer wrapper that picks an analyzer per index field.

    The Polish analyzer is handed to the wrapper's constructor; the
    fields listed in __init__ get an explicit analyzer registered
    with addAnalyzer().
    """
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        keyword = KeywordAnalyzer(Version.LUCENE_34)

        PerFieldAnalyzerWrapper.__init__(self, polish)

        # (field name, analyzer) pairs, registered in this order
        per_field = (
            ("tags", simple),
            ("technical_editors", simple),
            ("editors", simple),
            ("url", keyword),
            ("source_url", keyword),
            ("source_name", simple),
            ("publisher", simple),
            ("authors", simple),
            # shouldn't the title have two forms? _pl and simple?
            ("title", simple),
            ("is_book", keyword),
            ("themes", simple),
            ("themes_pl", polish),
            ("tag_name", simple),
            ("tag_name_pl", polish),
            ("translators", simple),
            # pseudo-fields naming each analyzer directly
            ("KEYWORD", keyword),
            ("SIMPLE", simple),
            ("POLISH", polish),
        )
        for field_name, field_analyzer in per_field:
            self.addAnalyzer(field_name, field_analyzer)
75
76
class IndexStore(object):
    """
    Gives access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        index_path = File(settings.SEARCH_INDEX)
        self.store = NIOFSDirectory(index_path)

    def make_index_dir(self):
        """
        Create the settings.SEARCH_INDEX directory.
        An already existing directory is fine; any other OSError is re-raised.
        """
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
94
95
class IndexChecker(IndexStore):
    """
    Runs CheckIndex on the index directory opened by IndexStore.
    """
    def __init__(self):
        super(IndexChecker, self).__init__()

    def check(self):
        """
        Run CheckIndex.checkIndex() on self.store and return its status.
        """
        return CheckIndex(self.store).checkIndex()
104
105
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.

    One file per book is kept under
    settings.SEARCH_INDEX/SNIPPET_DIR, named after the book id.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        """
        Make sure the snippet directory exists and remember the book id.
        The snippet file itself is not opened until open() is called.
        """
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # an already existing directory is fine, anything else is not
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None
        # byte offset at which the next add() will write; defined here so
        # the attribute exists even before open() is called
        self.position = 0

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        The file is always opened in binary mode ('b' is appended to
        mode when missing). Returns self.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple; both are counted in
        UTF-8 encoded bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close snippet file (does nothing when it was never opened)."""
        if self.file is not None:
            self.file.close()
158
159
class BaseIndex(IndexStore):
    """
    Common base for index classes.
    Takes care of opening, optimizing and closing the index writer,
    and can be used as a context manager.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        # fall back to the project analyzer when none (or a falsy one) is given
        self.analyzer = analyzer or WLAnalyzer()

    def open(self, timeout=None):
        """
        Create the IndexWriter for self.store and return it.
        A truthy timeout is set as the write lock timeout.
        Raises Exception when the index is already opened.
        """
        if self.index:
            raise Exception("Index is already opened")
        writer_conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            writer_conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, writer_conf)
        return self.index

    def optimize(self):
        """Optimize the opened index."""
        self.index.optimize()

    def close(self):
        """
        Optimize and close the index writer.
        A JavaError raised while optimizing is only printed; the writer
        is closed regardless.
        """
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
199
200
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category

        Besides catalogue tags (all categories except 'set'), the
        PDCounter authors and books are indexed as tags of category
        'pd_author' / 'pd_book' and marked with is_pdcounter.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.exclude(category='set'):
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterBook.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        The parent book id is added too when the book has a parent.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.

        With overwrite set, documents already indexed for this book are
        removed first. The title, authors and published_date fields are
        copied onto every content document; a KeyError is raised when the
        metadata has no 'title' or 'authors' entry.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            # a metadata entry is either a single field or a sequence of them
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    # get_master() returns the first element whose tag is listed here
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # passed to the content walker as the tags to ignore
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # elements with these tags are indexed as separate footnote documents
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # children of the master element that are not indexed as sections
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # a run of digits at the end of the text, optionally followed by
    # ']', '.' or spaces (raw string: '\]' is not a valid string escape)
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname

        book_info is parsed from the book's XML file when not given.
        Besides 'slug', 'tags' and 'is_book', every non-empty
        dcparser.BookInfo field of a known type (unicode, person, date)
        gets an entry; 'published_date' is always present (possibly empty).
        """
        fields = {}

        if book_info is None:
            # close the file as soon as the metadata has been parsed
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if not hasattr(book_info, field.name):
                continue
            if not getattr(book_info, field.name):
                continue
            # since no type information is available, we use validator
            type_indicator = field.validator
            if type_indicator == dcparser.as_unicode:
                s = getattr(book_info, field.name)
                if field.multiple:
                    s = ', '.join(s)
                try:
                    fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                except JavaError as je:
                    raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
            elif type_indicator == dcparser.as_person:
                p = getattr(book_info, field.name)
                if isinstance(p, dcparser.Person):
                    persons = unicode(p)
                else:
                    persons = ', '.join(map(unicode, p))
                fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
            elif type_indicator == dcparser.as_date:
                dt = getattr(book_info, field.name)
                fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                           (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date: the trailing number of source_name, if any
        pd = ""
        source_name = getattr(book_info, 'source_name', None)
        if source_name:
            match = self.published_date_re.search(source_name)
            if match is not None:
                pd = str(match.group(1))
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).

        Returns a tuple: f1, gap, f2, gap, ..., fn. An empty input gives
        an empty tuple and a single field is returned without any gap.
        """
        fields = list(fields)
        if not fields:
            return ()

        interposed = [fields[0]]
        for field in fields[1:]:
            # every gap is a separate Field instance
            interposed.append(Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED))
            interposed.append(field)
        return tuple(interposed)

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        Returns None when the tree has no element listed in master_tags.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
        return None

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.

        book_fields - fields added to every document created here.

        One document is added per child of the master element (section),
        one per footnote and one per themed fragment (begin/end pair with
        at least one motyw). The indexed text of every document is also
        appended to the book's snippet file. Returns [] when the book has
        no master element, None otherwise.
        """
        if book_fields is None:
            book_fields = []

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # Yields (start, text, end) triples with exactly one item set.
            # NOTE(review): ignore_tags is not handed down to the recursive
            # call, so only the node walker() is first called with is checked
            # against it - nested ignored tags are still walked. Left as is,
            # because skipping nested subtrees would also drop any
            # begin/motyw/end markers inside them; confirm the intent.
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # Joins a list of text chunks (skipping None) with spaces and
            # strips the '/' that ends a line.
            if isinstance(text, list):
                # join the chunks of the text that was passed in
                text = u' '.join([s for s in text if s is not None])

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Builds a document for one part of the book. Required keys:
            # header_index, header_type, content. Optional keys: header_span,
            # fragment_anchor, themes, is_footnote (a ready Field).
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span') or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'is_footnote' in fields:
                # the caller passes a ready-made Field marking the footnote
                doc.add(fields['is_footnote'])

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        # fragments opened by <begin> and not yet closed, keyed by fragment id
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            # position counts every child of master, skipped ones included
            for position, header in enumerate(list(master)):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    # section text belongs to every open fragment as well
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                # stack of text handlers; the top one receives the text,
                # a None entry swallows it
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)
                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        # the handler is popped and the footnote indexed even
                        # when no text was collected
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=u''.join(footnote),
                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                        self.index.addDocument(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # theme names are not part of the content
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        self.index.addDocument(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)

        finally:
            snippets.close()
543
544
def log_exception_wrapper(f):
    """
    Wrap f so that any Exception it raises is printed (message and
    traceback) before being re-raised.

    The wrapper passes positional and keyword arguments through and
    returns whatever f returns.
    """
    def _wrap(*a, **kw):
        try:
            return f(*a, **kw)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            # bare raise keeps the original traceback
            raise
    return _wrap
554
555
class ReusableIndex(Index):
    """
    Works like index, but does not close/optimize Lucene index
    until program exit (uses atexit hook).
    This is usefull for importbooks command.

    if you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
    """
    # the index writer shared by all instances, None while not opened
    index = None

    def open(self, analyzer=None, **kw):
        """
        Attach to the shared index writer, opening it on first use.

        analyzer - when given and the shared writer is not opened yet,
        replaces self.analyzer for the writer being created.
        Keyword arguments (e.g. timeout) are passed to Index.open().
        Returns the index writer.
        """
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            if analyzer is not None:
                self.analyzer = analyzer
            # open() of the base class takes no analyzer argument; its
            # only parameter is timeout
            Index.open(self, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)
        return self.index

    @staticmethod
    def close_reusable():
        """
        Optimize and close the shared index writer, if it is opened.
        As in BaseIndex.close(), a JavaError raised while optimizing is
        only printed, so the writer still gets closed.
        """
        if ReusableIndex.index:
            print("closing index")
            try:
                ReusableIndex.index.optimize()
            except JavaError as je:
                print("Error during optimize phase, check index: %s" % je)
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        """Commit pending changes; the shared writer stays opened."""
        if ReusableIndex.index:
            ReusableIndex.index.commit()
590
591
class JoinSearch(object):
    """
    Mixin for handling block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrites the query: a non-boolean query whose terms
        all belong to the given fields is wrapped in a BlockJoinQuery
        (and so delegated to children documents); a boolean query has
        each of its clauses rewritten in place.
        """
        if not BooleanQuery.instance_(query):
            terms = HashSet()
            query.extractTerms(terms)
            for raw_term in terms:
                # a single term outside of fields leaves the query untouched
                if Term.cast_(raw_term).field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

        bool_query = BooleanQuery.cast_(query)
        for raw_clause in bool_query:
            clause = BooleanClause.cast_(raw_clause)
            wrapped = self.wrapjoins(clause.getQuery(), fields)
            clause.setQuery(wrapped)
        return bool_query

    def bsearch(self, query, max_results=50):
        """
        Run the query as a block join query and return a tuple of
        (list of Book objects for the top hits, total number of hits).
        """
        block_query = BlockJoinQuery(self.query(query), self.parent_filter,
                                     BlockJoinQuery.ScoreMode.Avg)
        top_docs = self.searcher.search(block_query, max_results)

        books = []
        for hit in top_docs.scoreDocs:
            stored = self.searcher.doc(hit.doc)
            book = catalogue.models.Book.objects.get(id=stored.get("book_id"))
            books.append(book)
        return (books, top_docs.totalHits)
633
634
class SearchResult(object):
    """
    Search result for a single book, built from one Lucene hit.
    Results for the same book can be merged; the collected hits
    (sections and fragments) are post-processed by the hits property.
    """
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        """
        search - object giving access to the searcher (search.searcher)
        scoreDocs - the hit: provides .doc (document number) and .score
        score - overrides the score of the hit when truthy
        how_found, snippets - extra information stored with the hit
        searched - the searched text, used to find matching themes
        tokens_cache - cache handed to search.get_tokens()
        """
        if tokens_cache is None:
            tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        try:
            self.published_date = int(pd)
        except (TypeError, ValueError):
            # field missing from the document (None) or not a number
            self.published_date = 0

        header_type = stored.get("header_type")
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            # a missing or zero span counts as one section
            header_span = (int(header_span) if header_span is not None else 0) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score,
                   {'how_found': how_found, 'snippets': [snippets] if snippets else []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        """Score of the result multiplied by its boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Take over the hits of another result for the same book, and its
        score when it is higher. Returns self.
        Raises ValueError when the other result is for a different book.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        """Fetch the Book this result is for."""
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        """
        Processed hits: a list of dicts sorted by descending score.

        Section hits lying inside a hit fragment are dropped, duplicates
        (the same fragment or the same section) are reduced to the best
        scored one, and fragment hits get the Fragment object with its
        themes attached. The list is computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # layout of a hit tuple and of its position triple
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = [h for h in self._hits if h[FRAGMENT] is not None]

        def inside_fragment(section):
            index = section[POSITION][POSITION_INDEX]
            return any(
                f[POSITION][POSITION_INDEX] <= index
                < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN]
                for f in frags)

        sect = [h for h in self._hits
                if h[FRAGMENT] is None and not inside_fragment(h)]

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments and fragments[fid][SCORE] >= f[SCORE]:
                continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[SCORE]:
                continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = list(sections.values())

        # tokens of the searched text, fetched once on first need
        searched_tokens = None

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                if searched_tokens is None:
                    searched_tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    if any(t in name_tokens for t in searched_tokens) and theme not in themes_hit:
                        themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        # the score is a float, so it is not formatted with %d
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results from any number of result lists, so that there is
        one result per book. The first result seen for a book absorbs
        the later ones (see merge). Returns a list of results.
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return list(books.values())

    def __cmp__(self, other):
        """Order by score; for equal scores the earlier date ranks higher."""
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
801
802
class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind
    we can narrow down search using filters.
    """
    def __init__(self, search):
        """
        Accepts the search object whose get_tokens() is used to analyze
        tag names (Search.hint() passes the Search instance itself).
        """
        self.search = search
        # category name -> list of tags of that category (book-level tags)
        self.book_tags = {}
        # theme tags, used to filter book parts
        self.part_tags = []
        # books the search is limited to; empty means "no limit"
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        # *books arrives as a tuple.  Keep a list so that "no books" is
        # represented the same way as the initial state; a tuple never
        # compares equal to [], which made an empty hint look non-empty.
        self._books = list(books)

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are necessary.

        Tags of category author/title/epoch/genre/kind are stored per
        category as book tags; theme/theme_pl tags are stored as part tags.
        Tags of any other category are ignored.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                self.book_tags.setdefault(t.category, []).append(t)
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (but they are normally in tags field)
        returns a filter accepting only books with specific tags.

        Every tag name is tokenized for `field` and turned into a phrase;
        all phrases are required (MUST).
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        Returns None when no book tags were hinted.
        """
        tags = [t for group in self.book_tags.values() for t in group]
        if tags:
            return self.tag_filter(tags)
        return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        Returns whatever Search.chain_filters() makes of the collected
        filters (None when there are none).
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            # accept a part when its book_id equals the id of any hinted book
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        """Tell whether a book search still makes sense: True unless specific books were hinted."""
        return not self._books

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched, when we have some hints already.

        Returns the fields from `all` that are not already pinned down by a
        hint: 'authors' is dropped when an author tag is known, 'title' when
        books are known, 'themes'/'themes_pl' when theme tags are known.
        """
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if field in ('themes', 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
896
897
898 class Search(IndexStore):
899     """
900     Search facilities.
901     """
902     def __init__(self, default_field="content"):
903         IndexStore.__init__(self)
904         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
905         # self.analyzer = WLAnalyzer()
906         self.searcher = IndexSearcher(self.store, True)
907         self.parser = QueryParser(Version.LUCENE_34, default_field,
908                                   self.analyzer)
909
910         self.parent_filter = TermsFilter()
911         self.parent_filter.addTerm(Term("is_book", "true"))
912
913     def query(self, query):
914         """Parse query in default Lucene Syntax. (for humans)
915         """
916         return self.parser.parse(query)
917
918     def simple_search(self, query, max_results=50):
919         """Runs a query for books using lucene syntax. (for humans)
920         Returns (books, total_hits)
921         """
922
923         tops = self.searcher.search(self.query(query), max_results)
924         bks = []
925         for found in tops.scoreDocs:
926             doc = self.searcher.doc(found.doc)
927             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
928         return (bks, tops.totalHits)
929
930     def get_tokens(self, searched, field='content', cached=None):
931         """returns tokens analyzed by a proper (for a field) analyzer
932         argument can be: StringReader, string/unicode, or tokens. In the last case
933         they will just be returned (so we can reuse tokens, if we don't change the analyzer)
934         """
935         if cached is not None and field in cached:
936             return cached[field]
937
938         if isinstance(searched, str) or isinstance(searched, unicode):
939             searched = StringReader(searched)
940         elif isinstance(searched, list):
941             return searched
942
943         searched.reset()
944         tokens = self.analyzer.reusableTokenStream(field, searched)
945         toks = []
946         while tokens.incrementToken():
947             cta = tokens.getAttribute(CharTermAttribute.class_)
948             toks.append(cta.toString())
949
950         if cached is not None:
951             cached[field] = toks
952
953         return toks
954
955     def fuzziness(self, fuzzy):
956         """Helper method to sanitize fuzziness"""
957         if not fuzzy:
958             return None
959         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
960             return fuzzy
961         else:
962             return 0.5
963
964     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
965         """
966         Return a PhraseQuery with a series of tokens.
967         """
968         if fuzzy:
969             phrase = MultiPhraseQuery()
970             for t in tokens:
971                 term = Term(field, t)
972                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
973                 fuzzterms = []
974
975                 while True:
976                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
977                     ft = fuzzterm.term()
978                     if ft:
979                         fuzzterms.append(ft)
980                     if not fuzzterm.next(): break
981                 if fuzzterms:
982                     phrase.add(JArray('object')(fuzzterms, Term))
983                 else:
984                     phrase.add(term)
985         else:
986             phrase = PhraseQuery()
987             phrase.setSlop(slop)
988             for t in tokens:
989                 term = Term(field, t)
990                 phrase.add(term)
991         return phrase
992
993     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
994         """
995         Returns term queries joined by boolean query.
996         modal - applies to boolean query
997         fuzzy - should the query by fuzzy.
998         """
999         q = BooleanQuery()
1000         for t in tokens:
1001             term = Term(field, t)
1002             if fuzzy:
1003                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1004             else:
1005                 term = TermQuery(term)
1006             q.add(BooleanClause(term, modal))
1007         return q
1008
1009     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1010                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1011         if filters is None: filters = []
1012         if tokens_cache is None: tokens_cache = {}
1013
1014         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1015
1016         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1017         if book:
1018             filters.append(self.term_filter(Term('is_book', 'true')))
1019         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1020
1021         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1022
1023     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1024                     filters=None, tokens_cache=None, boost=None, snippets=True):
1025         if filters is None: filters = []
1026         if tokens_cache is None: tokens_cache = {}
1027
1028         if book:
1029             filters.append(self.term_filter(Term('is_book', 'true')))
1030
1031         query = BooleanQuery()
1032
1033         for fld in fields:
1034             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1035
1036             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1037                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1038
1039         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1040
1041         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1042                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1043
1044     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1045         """
1046         Search for perfect book matches. Just see if the query matches with some author or title,
1047         taking hints into account.
1048         """
1049         fields_to_search = ['authors', 'title']
1050         only_in = None
1051         if hint:
1052             if not hint.should_search_for_book():
1053                 return []
1054             fields_to_search = hint.just_search_in(fields_to_search)
1055             only_in = hint.book_filter()
1056
1057         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1058
1059         books = []
1060         for q in qrys:
1061             top = self.searcher.search(q,
1062                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1063                 max_results)
1064             for found in top.scoreDocs:
1065                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1066         return books
1067
1068     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1069         fields_to_search = ['tags', 'authors', 'title']
1070
1071         only_in = None
1072         if hint:
1073             if not hint.should_search_for_book():
1074                 return []
1075             fields_to_search = hint.just_search_in(fields_to_search)
1076             only_in = hint.book_filter()
1077
1078         tokens = self.get_tokens(searched, field='SIMPLE')
1079
1080         q = BooleanQuery()
1081
1082         for fld in fields_to_search:
1083             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1084                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1085
1086         books = []
1087         top = self.searcher.search(q,
1088                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1089             max_results)
1090         for found in top.scoreDocs:
1091             books.append(SearchResult(self, found, how_found="search_book"))
1092
1093         return books
1094
1095     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1096         """
1097         Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
1098         some part/fragment of the book.
1099         """
1100         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1101
1102         flt = None
1103         if hint:
1104             flt = hint.part_filter()
1105
1106         books = []
1107         for q in qrys:
1108             top = self.searcher.search(q,
1109                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1110                                                            flt]),
1111                                        max_results)
1112             for found in top.scoreDocs:
1113                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1114
1115         return books
1116
1117     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1118         """
1119         Tries to use search terms to match different fields of book (or its parts).
1120         E.g. one word can be an author survey, another be a part of the title, and the rest
1121         are some words from third chapter.
1122         """
1123         if tokens_cache is None: tokens_cache = {}
1124         books = []
1125         only_in = None
1126
1127         if hint:
1128             only_in = hint.part_filter()
1129
1130         # content only query : themes x content
1131         q = BooleanQuery()
1132
1133         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1134         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1135
1136         # only search in themes when we do not already filter by themes
1137         if hint is None or hint.just_search_in(['themes']) != []:
1138             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1139                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1140
1141         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1142                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1143
1144         topDocs = self.searcher.search(q, only_in, max_results)
1145         for found in topDocs.scoreDocs:
1146             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1147             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1148
1149         # query themes/content x author/title/tags
1150         q = BooleanQuery()
1151         in_content = BooleanQuery()
1152         in_meta = BooleanQuery()
1153
1154         for fld in ['themes_pl', 'content']:
1155             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1156
1157         for fld in ['tags', 'authors', 'title']:
1158             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1159
1160         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1161         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1162
1163         topDocs = self.searcher.search(q, only_in, max_results)
1164         for found in topDocs.scoreDocs:
1165             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1166             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1167
1168         return books
1169
1170     # def multisearch(self, query, max_results=50):
1171     #     """
1172     #     Search strategy:
1173     #     - (phrase) OR -> content
1174     #                   -> title
1175     #                   -> authors
1176     #     - (keywords)  -> authors
1177     #                   -> motyw
1178     #                   -> tags
1179     #                   -> content
1180     #     """
1181         # queryreader = StringReader(query)
1182         # tokens = self.get_tokens(queryreader)
1183
1184         # top_level = BooleanQuery()
1185         # Should = BooleanClause.Occur.SHOULD
1186
1187         # phrase_level = BooleanQuery()
1188         # phrase_level.setBoost(1.3)
1189
1190         # p_content = self.make_phrase(tokens, joined=True)
1191         # p_title = self.make_phrase(tokens, 'title')
1192         # p_author = self.make_phrase(tokens, 'author')
1193
1194         # phrase_level.add(BooleanClause(p_content, Should))
1195         # phrase_level.add(BooleanClause(p_title, Should))
1196         # phrase_level.add(BooleanClause(p_author, Should))
1197
1198         # kw_level = BooleanQuery()
1199
1200         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1201         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1202         # kw_level.add(j_themes, Should)
1203         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1204         # j_con = self.make_term_query(tokens, joined=True)
1205         # kw_level.add(j_con, Should)
1206
1207         # top_level.add(BooleanClause(phrase_level, Should))
1208         # top_level.add(BooleanClause(kw_level, Should))
1209
1210         # return None
1211
1212     def get_snippets(self, scoreDoc, query, field='content'):
1213         """
1214         Returns a snippet for found scoreDoc.
1215         """
1216         htmlFormatter = SimpleHTMLFormatter()
1217         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1218
1219         stored = self.searcher.doc(scoreDoc.doc)
1220
1221         position = stored.get('snippets_position')
1222         length = stored.get('snippets_length')
1223         if position is None or length is None:
1224             return None
1225         # locate content.
1226         book_id = int(stored.get('book_id'))
1227         snippets = Snippets(book_id).open()
1228         try:
1229             try:
1230                 text = snippets.get((int(position),
1231                                      int(length)))
1232             finally:
1233                 snippets.close()
1234
1235             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1236             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1237             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1238
1239         except Exception, e:
1240             e2 = e
1241             if hasattr(e, 'getJavaException'):
1242                 e2 = unicode(e.getJavaException())
1243             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1244                 e2)
1245         return snip
1246
1247     @staticmethod
1248     def enum_to_array(enum):
1249         """
1250         Converts a lucene TermEnum to array of Terms, suitable for
1251         addition to queries
1252         """
1253         terms = []
1254
1255         while True:
1256             t = enum.term()
1257             if t:
1258                 terms.append(t)
1259             if not enum.next(): break
1260
1261         if terms:
1262             return JArray('object')(terms, Term)
1263
1264     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1265         """
1266         Search for Tag objects using query.
1267         """
1268         if not pdcounter:
1269             filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1270         tops = self.searcher.search(query, filters, max_results)
1271
1272         tags = []
1273         for found in tops.scoreDocs:
1274             doc = self.searcher.doc(found.doc)
1275             is_pdcounter = doc.get('is_pdcounter')
1276             category = doc.get('tag_category')
1277             if is_pdcounter == 'true':
1278                 if category == 'pd_author':
1279                     tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1280                 elif category == 'pd_book':
1281                     tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1282                     tag.category = 'pd_book'  # make it look more lik a tag.
1283                 else:
1284                     print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1285             else:
1286                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1287                 # don't add the pdcounter tag if same tag already exists
1288             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1289                 tags.append(tag)
1290                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1291         print 'returning %s' % tags
1292         return tags
1293
1294     def search_books(self, query, filter=None, max_results=10):
1295         """
1296         Searches for Book objects using query
1297         """
1298         bks = []
1299         tops = self.searcher.search(query, filter, max_results)
1300         for found in tops.scoreDocs:
1301             doc = self.searcher.doc(found.doc)
1302             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1303         return bks
1304
1305     def make_prefix_phrase(self, toks, field):
1306         q = MultiPhraseQuery()
1307         for i in range(len(toks)):
1308             t = Term(field, toks[i])
1309             if i == len(toks) - 1:
1310                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1311                 if pterms:
1312                     q.add(pterms)
1313                 else:
1314                     q.add(t)
1315             else:
1316                 q.add(t)
1317         return q
1318
1319     @staticmethod
1320     def term_filter(term, inverse=False):
1321         only_term = TermsFilter()
1322         only_term.addTerm(term)
1323
1324         if inverse:
1325             neg = BooleanFilter()
1326             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1327             only_term = neg
1328
1329         return only_term
1330
1331     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1332         """
1333         Return auto-complete hints for tags
1334         using prefix search.
1335         """
1336         toks = self.get_tokens(string, field='SIMPLE')
1337         top = BooleanQuery()
1338
1339         for field in ['tag_name', 'tag_name_pl']:
1340             if prefix:
1341                 q = self.make_prefix_phrase(toks, field)
1342             else:
1343                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1344             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1345
1346         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1347
1348         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1349
1350     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1351         """
1352         Returns auto-complete hints for book titles
1353         Because we do not index 'pseudo' title-tags.
1354         Prefix search.
1355         """
1356         toks = self.get_tokens(string, field='SIMPLE')
1357
1358         if prefix:
1359             q = self.make_prefix_phrase(toks, 'title')
1360         else:
1361             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1362
1363         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1364
1365     @staticmethod
1366     def chain_filters(filters, op=ChainedFilter.AND):
1367         """
1368         Chains a filter list together
1369         """
1370         filters = filter(lambda x: x is not None, filters)
1371         if not filters or filters is []:
1372             return None
1373         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1374         return chf
1375
1376     def filtered_categories(self, tags):
1377         """
1378         Return a list of tag categories, present in tags list.
1379         """
1380         cats = {}
1381         for t in tags:
1382             cats[t.category] = True
1383         return cats.keys()
1384
1385     def hint(self):
1386         return Hint(self)