# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
    # KeywordAnalyzer

# Initialize the JVM
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)

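# Editor's note: an illustrative sketch, not part of the original module. It only
# demonstrates the per-field routing set up above: "authors" goes through
# SimpleAnalyzer, "url" stays a single keyword token, and unknown fields fall
# back to the default PolishAnalyzer.
def _example_wlanalyzer_tokens():
    analyzer = WLAnalyzer()
    stream = analyzer.reusableTokenStream("authors", StringReader(u"Adam Mickiewicz"))
    toks = []
    while stream.incrementToken():
        toks.append(stream.getAttribute(CharTermAttribute.class_).toString())
    return toks  # e.g. [u'adam', u'mickiewicz']
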
class IndexStore(object):
    """
    Provides access to the search index.

    self.store - Lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

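# Editor's note: an illustrative sketch (not in the original source) of the
# intended Snippets round trip; the book id 1 is just an example value and a
# configured settings.SEARCH_INDEX is assumed.
def _example_snippets_roundtrip():
    snippets = Snippets(1).open('w')
    try:
        pos = snippets.add(u"Litwo! Ojczyzno moja!")  # returns (position, length)
    finally:
        snippets.close()
    snippets = Snippets(1).open()                     # read mode
    try:
        return snippets.get(pos)                      # -> the same unicode text
    finally:
        snippets.close()
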

class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on an index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already open")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

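# Editor's note: a minimal sketch (not in the original source) of the context
# manager protocol defined above: the index is opened on __enter__ and
# optimized/closed on __exit__, even if indexing raises.
def _example_reindex_tags():
    with Index() as index:  # Index is defined just below
        index.index_tags()
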
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
        This allows phrase queries that do not match across the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
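    # Editor's note (not in the original source): for fields [t1, t2, t3] in,
    # say, the 'tags' field, add_gaps returns [t1, gap, t2, gap, t3], where each
    # gap is a single-space NOT_ANALYZED field; with slop 0 a phrase query can
    # match within one tag value but not across two neighbouring ones.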

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []

                for start, end in walker(header):
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # Collect content.
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(u' '.join(filter(lambda s: s is not None, content))))

                self.index.addDocument(doc)

        finally:
            snippets.close()

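# Editor's note: an illustrative sketch (not in the original source) of how the
# Index class above is meant to be driven; error handling is minimal on purpose.
def _example_reindex_everything():
    index = Index()
    index.open()
    try:
        index.index_tags()
        for book in catalogue.models.Book.objects.all():
            index.index_book(book)
    finally:
        index.close()  # also optimizes the index
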
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap

class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

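# Editor's note: a minimal sketch (not in the original source). Because
# ReusableIndex.close() is a no-op, several importers can share one writer; the
# atexit hook (or an explicit close_reusable()) optimizes and closes it once.
def _example_reusable_indexing(books):
    for book in books:
        index = ReusableIndex()
        index.open()
        index.index_book(book)
        index.close()               # no-op; the shared writer stays open
    ReusableIndex.close_reusable()  # optional if atexit can be relied upon
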
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    (currently unused)
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that any contained
        Term and Phrase queries which match the provided fields are wrapped
        in a BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self._hits = []
        self.hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        if snippets:
            snippets = snippets.replace("/\n", "\n")
        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with book %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self.hits = hits

        return self

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)

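# Editor's note: an illustrative sketch (not in the original source) showing how
# results from several queries are meant to be combined: aggregate() merges hits
# per book, process_hits() de-duplicates sections/fragments and sorts by score.
def _example_aggregate_results(search, query):
    perfect = search.search_perfect_book(query)
    parts = search.search_perfect_parts(query)
    everywhere = search.search_everywhere(query)
    results = SearchResult.aggregate(perfect, parts, everywhere)
    for result in results:
        result.process_hits()
    return sorted(results, reverse=True)  # uses SearchResult.__cmp__
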
class Hint(object):
    """
    Given some hint information (things we already know) about our search
    target - like author, title (a specific book), epoch, genre, kind -
    we can narrow the search down using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we are searching for these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (given as a list)
        are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag categories except theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some

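# Editor's note: a minimal sketch (not in the original source) of the intended
# Hint workflow; author_tag stands for any Tag with category 'author'.
def _example_hint_usage(search, author_tag, query):
    hint = search.hint()
    hint.tags([author_tag])
    # With an author hint, 'authors' is dropped from the fields to search...
    fields = hint.just_search_in(['authors', 'title'])  # -> ['title']
    # ...and book_filter() restricts matches to books carrying that tag.
    return search.search_perfect_book(query, hint=hint)
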
class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parses a query in the default Lucene syntax (for humans).
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a book query using Lucene syntax (for humans).
        Returns (books, total_hits).
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list; in the
        last case the tokens are just returned (so we can reuse them as long as the
        analyzer does not change).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness value."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Returns a PhraseQuery built from a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined into a boolean query.
        modal - the occurrence requirement applied to each clause.
        fuzzy - whether the term queries should be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Searches for perfect book matches. Simply checks whether the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Searches for book parts which contain a phrase perfectly matching
        (with a slop of 2, the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use the search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the
        rest some words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content')
        tokens = self.get_tokens(searched, field='SIMPLE')

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        # return None

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Searches for Tag objects using the query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using the query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title tags).
        Uses prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a list of filters together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
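

# Editor's note: a small sketch (not in the original source) of the
# auto-complete helpers above; the prefixes are just example inputs.
def _example_autocomplete():
    search = Search()
    tags = search.hint_tags(u"mick")     # Tag objects whose name starts with "mick"
    books = search.hint_books(u"pan t")  # Books whose title starts with "pan t"
    return tags, books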