add pdcounter book as search tag
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
77 class IndexStore(object):
78     """
79     Provides access to the search index.
80
81     self.store - Lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno != errno.EEXIST:
92                 raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages snippet files for an indexed object (book).
109     The snippets are concatenated together, and their positions and
110     lengths are kept in Lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno != errno.EEXIST:
119                 raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a tuple of (position, length) return the unicode text
149         of the snippet stored there.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
159
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, timeout=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
176         if timeout:
177             conf.setWriteLockTimeout(long(timeout))
178         self.index = IndexWriter(self.store, conf)
179         return self.index
180
181     def optimize(self):
182         self.index.optimize()
183
184     def close(self):
185         try:
186             self.index.optimize()
187         except JavaError, je:
188             print "Error during optimize phase, check index: %s" % je
189
190         self.index.close()
191         self.index = None
192
193     def __enter__(self):
194         self.open()
195         return self
196
197     def __exit__(self, type, value, tb):
198         self.close()
199
200
201 class Index(BaseIndex):
202     """
203     Class indexing books.
204     """
205     def __init__(self, analyzer=None):
206         super(Index, self).__init__(analyzer)
207
208     def index_tags(self):
209         """
210         Re-index global tag list.
211         Removes all tags from the index, then indexes them again.
212         Indexed fields include: id, name (with and without Polish stems), category.
213         """
214         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
215         self.index.deleteDocuments(q)
216
217         for tag in catalogue.models.Tag.objects.all():
218             doc = Document()
219             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
220             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
221             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
222             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
223             self.index.addDocument(doc)
224
225         for pdtag in PDCounterAuthor.objects.all():
226             doc = Document()
227             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
228             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
229             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
230             doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
231             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
232             self.index.addDocument(doc)
233
234         for pdtag in PDCounterBook.objects.all():
235             doc = Document()
236             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
238             doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
239             doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
240             doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
241             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
242             self.index.addDocument(doc)
243
244     def create_book_doc(self, book):
245         """
246         Create a Lucene document referring to the book id.
247         """
248         doc = Document()
249         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
250         if book.parent is not None:
251             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
252         return doc
253
254     def remove_book(self, book):
255         """Removes a book from search index.
256         book - Book instance."""
257         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
258         self.index.deleteDocuments(q)
259
260     def index_book(self, book, book_info=None, overwrite=True):
261         """
262         Indexes the book.
263         Creates a lucene document for extracted metadata
264         and calls self.index_content() to index the contents of the book.
265         """
266         if overwrite:
267             self.remove_book(book)
268
269         book_doc = self.create_book_doc(book)
270         meta_fields = self.extract_metadata(book, book_info)
271         for f in meta_fields.values():
272             if isinstance(f, list) or isinstance(f, tuple):
273                 for elem in f:
274                     book_doc.add(elem)
275             else:
276                 book_doc.add(f)
277         self.index.addDocument(book_doc)
278         del book_doc
279
280         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
281
282     master_tags = [
283         'opowiadanie',
284         'powiesc',
285         'dramat_wierszowany_l',
286         'dramat_wierszowany_lp',
287         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
288         'wywiad',
289         ]
290
291     ignore_content_tags = [
292         'uwaga', 'extra',
293         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
294         'didaskalia',
295         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
296         ]
297
298     footnote_tags = ['pa', 'pt', 'pr', 'pe']
299
300     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
301
302     published_date_re = re.compile("([0-9]+)[\]. ]*$")
303
304     def extract_metadata(self, book, book_info=None):
305         """
306         Extracts metadata from the book and returns a map of fields keyed by field name.
307         """
308         fields = {}
309
310         if book_info is None:
311             book_info = dcparser.parse(open(book.xml_file.path))
312
313         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
314         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
315         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
316
317         # validator, name
318         for field in dcparser.BookInfo.FIELDS:
319             if hasattr(book_info, field.name):
320                 if not getattr(book_info, field.name):
321                     continue
322                 # since no type information is available, we use validator
323                 type_indicator = field.validator
324                 if type_indicator == dcparser.as_unicode:
325                     s = getattr(book_info, field.name)
326                     if field.multiple:
327                         s = ', '.join(s)
328                     try:
329                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
330                     except JavaError as je:
331                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
332                 elif type_indicator == dcparser.as_person:
333                     p = getattr(book_info, field.name)
334                     if isinstance(p, dcparser.Person):
335                         persons = unicode(p)
336                     else:
337                         persons = ', '.join(map(unicode, p))
338                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
339                 elif type_indicator == dcparser.as_date:
340                     dt = getattr(book_info, field.name)
341                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
342                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
343
344         # get published date
345         pd = None
346         if hasattr(book_info, 'source_name') and book_info.source_name:
347             match = self.published_date_re.search(book_info.source_name)
348             if match is not None:
349                 pd = str(match.groups()[0])
350         if not pd: pd = ""
351         fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
352
353         return fields
354
355     def add_gaps(self, fields, fieldname):
356         """
357         Interposes gap-fields (indexed single spaces) between the given fields and returns the result.
358         This keeps phrase queries from matching across the gaps (when slop is 0).
359         """
360         def gap():
361             while True:
362                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
363         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
364
365     def get_master(self, root):
366         """
367         Returns the first master tag from an etree.
368         """
369         for master in root.iter():
370             if master.tag in self.master_tags:
371                 return master
372
373     def index_content(self, book, book_fields=[]):
374         """
375         Walks the book XML and extracts content from it.
376         Adds parts for each header tag and for each fragment.
377         """
378         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
379         root = wld.edoc.getroot()
380
381         master = self.get_master(root)
382         if master is None:
383             return []
384
385         def walker(node, ignore_tags=[]):
386
387             if node.tag not in ignore_tags:
388                 yield node, None, None
389                 if node.text is not None:
390                     yield None, node.text, None
391                 for child in list(node):
392                     for b, t, e in walker(child):
393                         yield b, t, e
394                 yield None, None, node
395
396             if node.tail is not None:
397                 yield None, node.tail, None
398             return
399
400         def fix_format(text):
401             #            separator = [u" ", u"\t", u".", u";", u","]
402             if isinstance(text, list):
403                 # need to join it first
404                 text = filter(lambda s: s is not None, text)
405                 text = u' '.join(text)
406                 # for i in range(len(text)):
407                 #     if i > 0:
408                 #         if text[i][0] not in separator\
409                 #             and text[i - 1][-1] not in separator:
410                 #          text.insert(i, u" ")
411
412             return re.sub("(?m)/$", "", text)
413
414         def add_part(snippets, **fields):
415             doc = self.create_book_doc(book)
416             for f in book_fields:
417                 doc.add(f)
418
419             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
420             doc.add(NumericField("header_span", Field.Store.YES, True)\
421                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
422             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
423
424             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
425                           Field.TermVector.WITH_POSITIONS_OFFSETS))
426
427             snip_pos = snippets.add(fields["content"])
428             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
429             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
430
431             if 'fragment_anchor' in fields:
432                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
433                               Field.Store.YES, Field.Index.NOT_ANALYZED))
434
435             if 'themes' in fields:
436                 themes, themes_pl = zip(*[
437                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
438                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
439                      for theme in fields['themes']])
440
441                 themes = self.add_gaps(themes, 'themes')
442                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
443
444                 for t in themes:
445                     doc.add(t)
446                 for t in themes_pl:
447                     doc.add(t)
448
449             return doc
450
451         def give_me_utf8(s):
452             if isinstance(s, unicode):
453                 return s.encode('utf-8')
454             else:
455                 return s
456
457         fragments = {}
458         snippets = Snippets(book.id).open('w')
459         try:
460             for header, position in zip(list(master), range(len(master))):
461
462                 if header.tag in self.skip_header_tags:
463                     continue
464                 if header.tag is etree.Comment:
465                     continue
466
467                 # section content
468                 content = []
469                 footnote = []
470
471                 def all_content(text):
472                     for frag in fragments.values():
473                         frag['content'].append(text)
474                     content.append(text)
475                 handle_text = [all_content]
476
477
478                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
479                     # handle footnotes
480                     if start is not None and start.tag in self.footnote_tags:
481                         footnote = []
482                         def collect_footnote(t):
483                             footnote.append(t)
484                         handle_text.append(collect_footnote)
485                     elif end is not None and footnote and end.tag in self.footnote_tags:
486                         handle_text.pop()
487                         doc = add_part(snippets, header_index=position, header_type=header.tag,
488                                        content=u''.join(footnote),
489                                        is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
490                 
491                         self.index.addDocument(doc)
492                         #print "@ footnote text: %s" % footnote
493                         footnote = []
494                     
495                     # handle fragments and themes.
496                     if start is not None and start.tag == 'begin':
497                         fid = start.attrib['id'][1:]
498                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
499
500                     # themes for this fragment
501                     elif start is not None and start.tag == 'motyw':
502                         fid = start.attrib['id'][1:]
503                         handle_text.append(None)
504                         if start.text is not None:
505                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
506                     elif end is not None and end.tag == 'motyw':
507                         handle_text.pop()
508
509                     elif start is not None and start.tag == 'end':
510                         fid = start.attrib['id'][1:]
511                         if fid not in fragments:
512                             continue  # a broken <end> node, skip it
513                         frag = fragments[fid]
514                         if frag['themes'] == []:
515                             continue  # empty themes list.
516                         del fragments[fid]
517
518                         doc = add_part(snippets,
519                                        header_type=frag['start_header'],
520                                        header_index=frag['start_section'],
521                                        header_span=position - frag['start_section'] + 1,
522                                        fragment_anchor=fid,
523                                        content=fix_format(frag['content']),
524                                        themes=frag['themes'])
525                         #print '@ FRAG %s' % frag['content']
526                         self.index.addDocument(doc)
527
528                         # Collect content.
529
530                     if text is not None and handle_text:
531                         hdl = handle_text[-1]
532                         if hdl is not None:
533                             hdl(text)
534
535                         # in the end, add a section text.
536                 doc = add_part(snippets, header_index=position, header_type=header.tag,
537                                content=fix_format(content))
538                 #print '@ CONTENT: %s' % fix_format(content)
539
540                 self.index.addDocument(doc)
541
542         finally:
543             snippets.close()
544
545
546 def log_exception_wrapper(f):
547     def _wrap(*a):
548         try:
549             f(*a)
550         except Exception, e:
551             print("Error in indexing thread: %s" % e)
552             traceback.print_exc()
553             raise
554     return _wrap
555
556
557 class ReusableIndex(Index):
558     """
559     Works like Index, but does not close/optimize the Lucene index
560     until program exit (uses an atexit hook).
561     This is useful for the importbooks command.
562
563     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
564     """
565     index = None
566
567     def open(self, analyzer=None, **kw):
568         if ReusableIndex.index:
569             self.index = ReusableIndex.index
570         else:
571             print("opening index")
572             Index.open(self, **kw)
573             ReusableIndex.index = self.index
574             atexit.register(ReusableIndex.close_reusable)
575
576     # def index_book(self, *args, **kw):
577     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
578     #     ReusableIndex.pool_jobs.append(job)
579
580     @staticmethod
581     def close_reusable():
582         if ReusableIndex.index:
583             print("closing index")
584             ReusableIndex.index.optimize()
585             ReusableIndex.index.close()
586             ReusableIndex.index = None
587
588     def close(self):
589         if ReusableIndex.index:
590             ReusableIndex.index.commit()
591
592
593 class JoinSearch(object):
594     """
595     This mixin could be used to handle block join queries.
596     (currently unused)
597     """
598     def __init__(self, *args, **kw):
599         super(JoinSearch, self).__init__(*args, **kw)
600
601     def wrapjoins(self, query, fields=[]):
602         """
603         This function modifies the query recursively,
604         so that contained Term and Phrase queries which match
605         the provided fields are wrapped in a BlockJoinQuery,
606         and thus delegated to child documents.
607         """
608         if BooleanQuery.instance_(query):
609             qs = BooleanQuery.cast_(query)
610             for clause in qs:
611                 clause = BooleanClause.cast_(clause)
612                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
613             return qs
614         else:
615             termset = HashSet()
616             query.extractTerms(termset)
617             for t in termset:
618                 t = Term.cast_(t)
619                 if t.field() not in fields:
620                     return query
621             return BlockJoinQuery(query, self.parent_filter,
622                                   BlockJoinQuery.ScoreMode.Total)
623
624     def bsearch(self, query, max_results=50):
625         q = self.query(query)
626         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
627
628         tops = self.searcher.search(bjq, max_results)
629         bks = []
630         for found in tops.scoreDocs:
631             doc = self.searcher.doc(found.doc)
632             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
633         return (bks, tops.totalHits)
634
635
636 class SearchResult(object):
637     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
638         if tokens_cache is None: tokens_cache = {}
639
640         if score:
641             self._score = score
642         else:
643             self._score = scoreDocs.score
644
645         self.boost = 1.0
646
647         self._hits = []
648         self._processed_hits = None  # processed hits
649
650         stored = search.searcher.doc(scoreDocs.doc)
651         self.book_id = int(stored.get("book_id"))
652
653         pd = stored.get("published_date")
654         if pd is None:
655             pd = 0
656         self.published_date = int(pd)
657
658         header_type = stored.get("header_type")
659         # we have a content hit in some header or fragment
660         if header_type is not None:
661             sec = (header_type, int(stored.get("header_index")))
662             header_span = stored.get('header_span')
663             header_span = header_span is not None and int(header_span) or 1
664
665             fragment = stored.get("fragment_anchor")
666
667             if snippets:
668                 snippets = snippets.replace("/\n", "\n")
669             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
670
671             self._hits.append(hit)
672
673         self.search = search
674         self.searched = searched
675         self.tokens_cache = tokens_cache
676
677     @property
678     def score(self):
679         return self._score * self.boost
680
681     def merge(self, other):
682         if self.book_id != other.book_id:
683             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
684         self._hits += other._hits
685         if other.score > self.score:
686             self._score = other._score
687         return self
688
689     def get_book(self):
690         return catalogue.models.Book.objects.get(id=self.book_id)
691
692     book = property(get_book)
693
694     @property
695     def hits(self):
696         if self._processed_hits is not None:
697             return self._processed_hits
698
699         POSITION = 0
700         FRAGMENT = 1
701         POSITION_INDEX = 1
702         POSITION_SPAN = 2
703         SCORE = 2
704         OTHER = 3
705
706         # to sections and fragments
707         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
708         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
709         sect = filter(lambda s: 0 == len(filter(
710             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
711             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
712             frags)), sect)
713
714         hits = []
715
716         # remove duplicate fragments
717         fragments = {}
718         for f in frags:
719             fid = f[FRAGMENT]
720             if fid in fragments:
721                 if fragments[fid][SCORE] >= f[SCORE]:
722                     continue
723             fragments[fid] = f
724         frags = fragments.values()
725
726         # remove duplicate sections
727         sections = {}
728
729         for s in sect:
730             si = s[POSITION][POSITION_INDEX]
731             # skip existing
732             if si in sections:
733                 if sections[si]['score'] >= s[SCORE]:
734                     continue
735
736             m = {'score': s[SCORE],
737                  'section_number': s[POSITION][POSITION_INDEX] + 1,
738                  }
739             m.update(s[OTHER])
740             sections[si] = m
741
742         hits = sections.values()
743
744         for f in frags:
745             try:
746                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
747             except catalogue.models.Fragment.DoesNotExist:
748                 # stale index
749                 continue
750
751             # Figure out if we were searching for a token matching some word in theme name.
752             themes = frag.tags.filter(category='theme')
753             themes_hit = []
754             if self.searched is not None:
755                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
756                 for theme in themes:
757                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
758                     for t in tokens:
759                         if t in name_tokens:
760                             if theme not in themes_hit:
761                                 themes_hit.append(theme)
762                             break
763
764             m = {'score': f[SCORE],
765                  'fragment': frag,
766                  'section_number': f[POSITION][POSITION_INDEX] + 1,
767                  'themes': themes,
768                  'themes_hit': themes_hit
769                  }
770             m.update(f[OTHER])
771             hits.append(m)
772
773         hits.sort(key=lambda h: h['score'], reverse=True)
774
775         self._processed_hits = hits
776
777         return hits
778
779     def __unicode__(self):
780         return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
781
782     @staticmethod
783     def aggregate(*result_lists):
784         books = {}
785         for rl in result_lists:
786             for r in rl:
787                 if r.book_id in books:
788                     books[r.book_id].merge(r)
789                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
790                 else:
791                     books[r.book_id] = r
792         return books.values()
793
794     def __cmp__(self, other):
795         c = cmp(self.score, other.score)
796         if c == 0:
797             # this is inverted, because earlier date is better
798             return cmp(other.published_date, self.published_date)
799         else:
800             return c
801
802
803 class Hint(object):
804     """
805     Given some hint information (things we already know about
806     our search target - like author, title (a specific book), epoch, genre, kind)
807     we can narrow down the search using filters.
808     """
809     def __init__(self, search):
810         """
811         Accepts a Searcher instance.
812         """
813         self.search = search
814         self.book_tags = {}
815         self.part_tags = []
816         self._books = []
817
818     def books(self, *books):
819         """
820         Give a hint that we search these books.
821         """
822         self._books = books
823
824     def tags(self, tags):
825         """
826         Give a hint that these Tag objects (a list)
827         are necessary.
828         """
829         for t in tags:
830             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
831                 lst = self.book_tags.get(t.category, [])
832                 lst.append(t)
833                 self.book_tags[t.category] = lst
834             if t.category in ['theme', 'theme_pl']:
835                 self.part_tags.append(t)
836
837     def tag_filter(self, tags, field='tags'):
838         """
839         Given a list of tags and an optional field (they are normally in the 'tags' field),
840         returns a filter accepting only books with specific tags.
841         """
842         q = BooleanQuery()
843
844         for tag in tags:
845             toks = self.search.get_tokens(tag.name, field=field)
846             tag_phrase = PhraseQuery()
847             for tok in toks:
848                 tag_phrase.add(Term(field, tok))
849             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
850
851         return QueryWrapperFilter(q)
852
853     def book_filter(self):
854         """
855         Filters using book tags (all tag kinds except themes).
856         """
857         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
858         if tags:
859             return self.tag_filter(tags)
860         else:
861             return None
862
863     def part_filter(self):
864         """
865         This filter can be used to look for book parts.
866         It filters on book id and/or themes.
867         """
868         fs = []
869         if self.part_tags:
870             fs.append(self.tag_filter(self.part_tags, field='themes'))
871
872         if self._books != []:
873             bf = BooleanFilter()
874             for b in self._books:
875                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
876                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
877             fs.append(bf)
878
879         return Search.chain_filters(fs)
880
881     def should_search_for_book(self):
882         return self._books == []
883
884     def just_search_in(self, all):
885         """Holds logic to figure out which indexes should be searched when we already have some hints."""
886         some = []
887         for field in all:
888             if field == 'authors' and 'author' in self.book_tags:
889                 continue
890             if field == 'title' and self._books != []:
891                 continue
892             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
893                 continue
894             some.append(field)
895         return some
896
897
898 class Search(IndexStore):
899     """
900     Search facilities.
901     """
902     def __init__(self, default_field="content"):
903         IndexStore.__init__(self)
904         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
905         # self.analyzer = WLAnalyzer()
906         self.searcher = IndexSearcher(self.store, True)
907         self.parser = QueryParser(Version.LUCENE_34, default_field,
908                                   self.analyzer)
909
910         self.parent_filter = TermsFilter()
911         self.parent_filter.addTerm(Term("is_book", "true"))
912
913     def query(self, query):
914         """Parse a query using the default Lucene syntax (for humans).
915         """
916         return self.parser.parse(query)
917
918     def simple_search(self, query, max_results=50):
919         """Runs a query for books using Lucene syntax (for humans).
920         Returns (books, total_hits)
921         """
922
923         tops = self.searcher.search(self.query(query), max_results)
924         bks = []
925         for found in tops.scoreDocs:
926             doc = self.searcher.doc(found.doc)
927             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
928         return (bks, tops.totalHits)
929
930     def get_tokens(self, searched, field='content', cached=None):
931         """Returns tokens analyzed by the proper analyzer for the given field.
932         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
933         the tokens are just returned (so we can reuse them, as long as the analyzer does not change).
934         """
935         if cached is not None and field in cached:
936             return cached[field]
937
938         if isinstance(searched, str) or isinstance(searched, unicode):
939             searched = StringReader(searched)
940         elif isinstance(searched, list):
941             return searched
942
943         searched.reset()
944         tokens = self.analyzer.reusableTokenStream(field, searched)
945         toks = []
946         while tokens.incrementToken():
947             cta = tokens.getAttribute(CharTermAttribute.class_)
948             toks.append(cta.toString())
949
950         if cached is not None:
951             cached[field] = toks
952
953         return toks
954
955     def fuzziness(self, fuzzy):
956         """Helper method to sanitize fuzziness"""
957         if not fuzzy:
958             return None
959         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
960             return fuzzy
961         else:
962             return 0.5
963
964     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
965         """
966         Return a PhraseQuery with a series of tokens.
967         """
968         if fuzzy:
969             phrase = MultiPhraseQuery()
970             for t in tokens:
971                 term = Term(field, t)
972                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
973                 fuzzterms = []
974
975                 while True:
976                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
977                     ft = fuzzterm.term()
978                     if ft:
979                         fuzzterms.append(ft)
980                     if not fuzzterm.next(): break
981                 if fuzzterms:
982                     phrase.add(JArray('object')(fuzzterms, Term))
983                 else:
984                     phrase.add(term)
985         else:
986             phrase = PhraseQuery()
987             phrase.setSlop(slop)
988             for t in tokens:
989                 term = Term(field, t)
990                 phrase.add(term)
991         return phrase
992
993     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
994         """
995         Returns term queries joined by boolean query.
996         modal - applies to boolean query
997         fuzzy - should the query be fuzzy.
998         """
999         q = BooleanQuery()
1000         for t in tokens:
1001             term = Term(field, t)
1002             if fuzzy:
1003                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1004             else:
1005                 term = TermQuery(term)
1006             q.add(BooleanClause(term, modal))
1007         return q
1008
1009     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1010                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1011         if filters is None: filters = []
1012         if tokens_cache is None: tokens_cache = {}
1013
1014         tokens = self.get_tokens(searched, field, cached=tokens_cache)
1015
1016         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1017         if book:
1018             filters.append(self.term_filter(Term('is_book', 'true')))
1019         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1020
1021         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1022
1023     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1024                     filters=None, tokens_cache=None, boost=None, snippets=True):
1025         if filters is None: filters = []
1026         if tokens_cache is None: tokens_cache = {}
1027
1028         if book:
1029             filters.append(self.term_filter(Term('is_book', 'true')))
1030
1031         query = BooleanQuery()
1032
1033         for fld in fields:
1034             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1035
1036             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1037                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1038
1039         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1040
1041         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1042                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1043
1044     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1045         """
1046         Search for perfect book matches. Just see if the query matches some author or title,
1047         taking hints into account.
1048         """
1049         fields_to_search = ['authors', 'title']
1050         only_in = None
1051         if hint:
1052             if not hint.should_search_for_book():
1053                 return []
1054             fields_to_search = hint.just_search_in(fields_to_search)
1055             only_in = hint.book_filter()
1056
1057         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1058
1059         books = []
1060         for q in qrys:
1061             top = self.searcher.search(q,
1062                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1063                 max_results)
1064             for found in top.scoreDocs:
1065                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1066         return books
1067
1068     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1069         fields_to_search = ['tags', 'authors', 'title']
1070
1071         only_in = None
1072         if hint:
1073             if not hint.should_search_for_book():
1074                 return []
1075             fields_to_search = hint.just_search_in(fields_to_search)
1076             only_in = hint.book_filter()
1077
1078         tokens = self.get_tokens(searched, field='SIMPLE')
1079
1080         q = BooleanQuery()
1081
1082         for fld in fields_to_search:
1083             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1084                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1085
1086         books = []
1087         top = self.searcher.search(q,
1088                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1089             max_results)
1090         for found in top.scoreDocs:
1091             books.append(SearchResult(self, found, how_found="search_book"))
1092
1093         return books
1094
1095     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1096         """
1097         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1098         some part/fragment of the book.
1099         """
1100         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1101
1102         flt = None
1103         if hint:
1104             flt = hint.part_filter()
1105
1106         books = []
1107         for q in qrys:
1108             top = self.searcher.search(q,
1109                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1110                                                            flt]),
1111                                        max_results)
1112             for found in top.scoreDocs:
1113                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1114
1115         return books
1116
1117     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1118         """
1119         Tries to use search terms to match different fields of book (or its parts).
1120         E.g. one word can be an author's name, another a part of the title, and the rest
1121         some words from the third chapter.
1122         """
1123         if tokens_cache is None: tokens_cache = {}
1124         books = []
1125         only_in = None
1126
1127         if hint:
1128             only_in = hint.part_filter()
1129
1130         # content only query : themes x content
1131         q = BooleanQuery()
1132
1133         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1134         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1135
1136         # only search in themes when we do not already filter by themes
1137         if hint is None or hint.just_search_in(['themes']) != []:
1138             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1139                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1140
1141         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1142                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1143
1144         topDocs = self.searcher.search(q, only_in, max_results)
1145         for found in topDocs.scoreDocs:
1146             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1147             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1148
1149         # query themes/content x author/title/tags
1150         q = BooleanQuery()
1151         in_content = BooleanQuery()
1152         in_meta = BooleanQuery()
1153
1154         for fld in ['themes_pl', 'content']:
1155             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1156
1157         for fld in ['tags', 'authors', 'title']:
1158             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1159
1160         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1161         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1162
1163         topDocs = self.searcher.search(q, only_in, max_results)
1164         for found in topDocs.scoreDocs:
1165             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1166             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1167
1168         return books
1169
1170     # def multisearch(self, query, max_results=50):
1171     #     """
1172     #     Search strategy:
1173     #     - (phrase) OR -> content
1174     #                   -> title
1175     #                   -> authors
1176     #     - (keywords)  -> authors
1177     #                   -> motyw
1178     #                   -> tags
1179     #                   -> content
1180     #     """
1181         # queryreader = StringReader(query)
1182         # tokens = self.get_tokens(queryreader)
1183
1184         # top_level = BooleanQuery()
1185         # Should = BooleanClause.Occur.SHOULD
1186
1187         # phrase_level = BooleanQuery()
1188         # phrase_level.setBoost(1.3)
1189
1190         # p_content = self.make_phrase(tokens, joined=True)
1191         # p_title = self.make_phrase(tokens, 'title')
1192         # p_author = self.make_phrase(tokens, 'author')
1193
1194         # phrase_level.add(BooleanClause(p_content, Should))
1195         # phrase_level.add(BooleanClause(p_title, Should))
1196         # phrase_level.add(BooleanClause(p_author, Should))
1197
1198         # kw_level = BooleanQuery()
1199
1200         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1201         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1202         # kw_level.add(j_themes, Should)
1203         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1204         # j_con = self.make_term_query(tokens, joined=True)
1205         # kw_level.add(j_con, Should)
1206
1207         # top_level.add(BooleanClause(phrase_level, Should))
1208         # top_level.add(BooleanClause(kw_level, Should))
1209
1210         # return None
1211
1212     def get_snippets(self, scoreDoc, query, field='content'):
1213         """
1214         Returns a snippet for found scoreDoc.
1215         """
1216         htmlFormatter = SimpleHTMLFormatter()
1217         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1218
1219         stored = self.searcher.doc(scoreDoc.doc)
1220
1221         position = stored.get('snippets_position')
1222         length = stored.get('snippets_length')
1223         if position is None or length is None:
1224             return None
1225         # locate content.
1226         book_id = int(stored.get('book_id'))
1227         snippets = Snippets(book_id).open()
1228         try:
1229             try:
1230                 text = snippets.get((int(position),
1231                                      int(length)))
1232             finally:
1233                 snippets.close()
1234
1235             tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1236             #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1237             snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1238
1239         except Exception, e:
1240             e2 = e
1241             if hasattr(e, 'getJavaException'):
1242                 e2 = unicode(e.getJavaException())
1243             raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1244                 e2)
1245         return snip
1246
1247     @staticmethod
1248     def enum_to_array(enum):
1249         """
1250         Converts a lucene TermEnum to array of Terms, suitable for
1251         addition to queries
1252         """
1253         terms = []
1254
1255         while True:
1256             t = enum.term()
1257             if t:
1258                 terms.append(t)
1259             if not enum.next(): break
1260
1261         if terms:
1262             return JArray('object')(terms, Term)
1263
1264     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1265         """
1266         Search for Tag objects using query.
1267         """
1268         if not pdcounter:
1269             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1270         tops = self.searcher.search(query, filters, max_results)
1271
1272         tags = []
1273         for found in tops.scoreDocs:
1274             doc = self.searcher.doc(found.doc)
1275             is_pdcounter = doc.get('is_pdcounter')
1276             category = doc.get('tag_category')
1277             if is_pdcounter == 'true':
1278                 if category == 'pd_author':
1279                     tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1280                 elif category == 'pd_book':
1281                     tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1282                     tag.category = 'pd_book'  # make it look more like a tag.
1283                 else:
1284                     print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1285             else:
1286                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1287                 # don't add the pdcounter tag if same tag already exists
1288             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1289                 tags.append(tag)
1290                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1291         print 'returning %s' % tags
1292         return tags
1293
1294     def search_books(self, query, filter=None, max_results=10):
1295         """
1296         Searches for Book objects using query
1297         """
1298         bks = []
1299         tops = self.searcher.search(query, filter, max_results)
1300         for found in tops.scoreDocs:
1301             doc = self.searcher.doc(found.doc)
1302             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1303         return bks
1304
1305     def make_prefix_phrase(self, toks, field):
1306         q = MultiPhraseQuery()
1307         for i in range(len(toks)):
1308             t = Term(field, toks[i])
1309             if i == len(toks) - 1:
1310                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1311                 if pterms:
1312                     q.add(pterms)
1313                 else:
1314                     q.add(t)
1315             else:
1316                 q.add(t)
1317         return q
1318
1319     @staticmethod
1320     def term_filter(term, inverse=False):
1321         only_term = TermsFilter()
1322         only_term.addTerm(term)
1323
1324         if inverse:
1325             neg = BooleanFilter()
1326             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1327             only_term = neg
1328
1329         return only_term
1330
1331     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1332         """
1333         Return auto-complete hints for tags
1334         using prefix search.
1335         """
1336         toks = self.get_tokens(string, field='SIMPLE')
1337         top = BooleanQuery()
1338
1339         for field in ['tag_name', 'tag_name_pl']:
1340             if prefix:
1341                 q = self.make_prefix_phrase(toks, field)
1342             else:
1343                 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1344             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1345
1346         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1347
1348         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1349
1350     def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1351         """
1352         Returns auto-complete hints for book titles
1353         (because we do not index 'pseudo' title-tags).
1354         Uses prefix search.
1355         """
1356         toks = self.get_tokens(string, field='SIMPLE')
1357
1358         if prefix:
1359             q = self.make_prefix_phrase(toks, 'title')
1360         else:
1361             q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1362
1363         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1364
1365     @staticmethod
1366     def chain_filters(filters, op=ChainedFilter.AND):
1367         """
1368         Chains a filter list together
1369         """
1370         filters = filter(lambda x: x is not None, filters)
1371         if not filters:
1372             return None
1373         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1374         return chf
1375
1376     def filtered_categories(self, tags):
1377         """
1378         Return a list of tag categories present in the tags list.
1379         """
1380         cats = {}
1381         for t in tags:
1382             cats[t.category] = True
1383         return cats.keys()
1384
1385     def hint(self):
1386         return Hint(self)