# -*- coding: utf-8 -*-
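"""
Full-text search for the Wolne Lektury catalogue, built on PyLucene.

The indexing side (Index, ReusableIndex) creates one Lucene document per book
plus one per master-section header and per themed fragment; the search side
(Search, MultiSearch, Hint, SearchResult) wraps the queries and filters used
by the site.
"""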

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize JVM
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


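# Analyzer used for both indexing and query parsing: Polish stemming by
# default, with simple (lowercasing) or keyword (verbatim) analysis for the
# fields registered below.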
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


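# Plain-text snippet storage: indexed passages are appended to one flat file
# per book; add() returns a (position, length) pair which is stored in the
# Lucene document and later passed to get() to read the passage back for
# highlighting.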
class Snippets(object):
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        print "Snip<%s>%s</s>" % (pos, txt)
        return pos

    def get(self, pos):
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        print "got from snippets %d bytes from %s:" % (len(txt), pos)
        return txt

    def close(self):
        self.file.close()


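# Writer side of the index: index_book() adds one metadata document per book
# and index_content() adds one document per master-section header and per
# themed fragment, so content matches can be traced back into the book.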
class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def index_tags(self):
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['author']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        fields = {}
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

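    # add_gaps interleaves a list of field values with single-space separator
    # fields, so that phrase queries do not match across the boundary between
    # two adjacent values (cf. the commented-out setPositionIncrementGap calls
    # in WLAnalyzer above).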
    def add_gaps(self, fields, fieldname):
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("/$", "", text, flags=re.M)

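        # add_part builds one Lucene document for a single header or themed
        # fragment: the book fields passed in, the header position/span/type,
        # the analyzed content with term vectors, the snippet file offset, and
        # (for fragments) the anchor and theme fields.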
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, start.text.split(','))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()


    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


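# Read side: wraps an IndexSearcher and a QueryParser over a default field;
# parent_filter (is_book:true) marks book documents and is used to join
# section/fragment matches back to their books via BlockJoinQuery.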
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively: contained Term and Phrase queries
        which match only the given fields are wrapped in a BlockJoinQuery
        and so delegated to child (section/fragment) documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)
        """

        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


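# One matching book together with the places it was found. Each element of
# .hits is ((header_type, header_index, header_span), fragment_anchor_or_None,
# score, {'how_found': ..., 'snippets': ...}); results for the same book can
# be combined with merge() or SearchResult.aggregate().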
class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

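    # process_hits drops section hits that fall within an already matched
    # fragment (header_index inside the fragment's index..index+span range),
    # resolves fragment anchors to catalogue Fragment objects and returns a
    # list of dicts sorted by score, descending.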
    def process_hits(self):
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        print("--- %s" % hits)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


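# Hint gathers what is already known about a query (a concrete book and/or
# tags picked by the user), produces the corresponding Lucene filters and
# tells MultiSearch which fields still need to be searched.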
class Hint(object):
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._book = None

    def book(self, book):
        self._book = book

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))
        if self._book is not None:
            fs.append(NumericRangeFilter.newIntRange('book_id', self._book.id, self._book.id, True, True))
        return MultiSearch.chain_filters(fs)

    def should_search_for_book(self):
        return self._book is None

    def just_search_in(self, all):
        """Figures out which of the given fields still need to be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'author' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._book is not None:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


class MultiSearch(Search):
    """Class capable of IMDb-like searching."""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer proper for the given field.
        The argument can be a StringReader, a string/unicode, or a token list. In the last
        case the tokens are simply returned (so tokens can be reused if the analyzer doesn't change).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

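    # make_phrase builds a PhraseQuery (with the given slop) from the tokens;
    # with fuzzy enabled, each token is expanded via FuzzyTermEnum and the
    # alternatives are added together as a single MultiPhraseQuery position.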
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['author', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt
                                                          ]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

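    # search_everywhere combines a themes_pl/content query over section and
    # fragment documents with a broader should-match query over themes,
    # content, tags, author and title; every hit becomes a SearchResult for
    # later aggregation.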
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'author', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)
        #tops = self.searcher.search(p_content, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print "%s (%d) -> %f" % (b, b.id, found.score)
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content.
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        #        import pdb; pdb.set_trace()
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        print('snips: %s' % snip)

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print "%s (%d) -> %f" % (tag, tag.id, found.score)

        return tags

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
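
# Example usage (a sketch only; assumes a configured Django project with the
# catalogue app, an existing SEARCH_INDEX directory, and that this module is
# importable as search.index):
#
#     from search.index import ReusableIndex, MultiSearch, SearchResult
#     import catalogue.models
#
#     # Indexing, e.g. from the importbooks management command:
#     index = ReusableIndex()
#     index.open()
#     for book in catalogue.models.Book.objects.all():
#         index.index_book(book)
#     ReusableIndex.close_reusable()
#
#     # Searching:
#     search = MultiSearch()
#     results = SearchResult.aggregate(
#         search.search_perfect_book(u"Pan Tadeusz"),
#         search.search_everywhere(u"szabla"))
#     for r in sorted(results, reverse=True):
#         print r.book, r.process_hits()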