apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene classes are used.
JVM = initVM(CLASSPATH)

import sys
import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


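# A minimal usage sketch (assumes the JVM initialized above): the wrapper
# routes each named field to its analyzer and falls back to PolishAnalyzer
# for everything else.
#
#     analyzer = WLAnalyzer()
#     stream = analyzer.reusableTokenStream("tags", StringReader(u"powiesc historyczna"))
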
class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


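# A minimal integrity-check sketch (assumes settings.SEARCH_INDEX points at
# an existing index; `clean` is a field of lucene's CheckIndex.Status):
#
#     status = IndexChecker().check()
#     if not status.clean:
#         print("index is corrupted")
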
class Snippets(object):
    """Flat-file storage for snippet texts, one file per book."""
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """Appends the snippet and returns its (position, length) address."""
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """Reads back the snippet stored at the given (position, length) address."""
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        self.file.close()


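# A round-trip sketch (assumes a writable SEARCH_INDEX; book id 1 is arbitrary):
#
#     snips = Snippets(1).open('w')
#     pos = snips.add(u"Litwo! Ojczyzno moja!")
#     snips.close()
#     print(Snippets(1).open().get(pos))  # prints the stored snippet
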
class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def index_tags(self):
        """Re-indexes all tags: deletes existing tag documents, adds fresh ones."""
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, (list, tuple)):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)
        del book_doc

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book, book_info=None):
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        # map Dublin Core fields onto lucene fields
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        return fields

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def add_gaps(self, fields, fieldname):
        """Interleaves the fields with single-space 'gap' fields, so phrase
        queries cannot match across adjacent values."""
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

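    # A behavior sketch for add_gaps (f1, f2 stand for any Field objects):
    #     add_gaps([f1, f2], 'tags') -> [f1, <gap>, f2]
    # where <gap> is a NOT_ANALYZED single-space field.
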
    def index_content(self, book, book_fields=[]):
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            """Depth-first walk yielding (node, None) on entry and (None, node) on exit."""
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue(fields.get('header_span', 1)))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list
                        del fragments[fid]

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: u'(none)' if x is None else unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()


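# A minimal indexing sketch (assumes Django settings are configured and
# `book` is a catalogue.models.Book with an attached XML file):
#
#     with Index() as index:
#         index.index_book(book)
#         index.index_tags()
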
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass


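# A batch-indexing sketch with the shared writer (the writer is optimized and
# closed once, by the atexit hook):
#
#     idx = ReusableIndex()
#     idx.open()
#     for book in catalogue.models.Book.objects.all():
#         idx.index_book(book)
#     # no explicit close; atexit runs ReusableIndex.close_reusable()
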
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively rewrites the query so that Term and Phrase queries
        matching the given fields are wrapped in a BlockJoinQuery and
        thereby delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

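# A minimal query sketch (assumes a populated index):
#
#     search = Search()
#     books, total = search.simple_search(u"pan tadeusz")
#     print("%d total hits, fetched %d books" % (total, len(books)))
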
# Reference (Java) for iterating a lucene token stream:
#
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
#
# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDoc, score=None, how_found=None, snippets=None):
        self.snippets = []

        if score:
            self.score = score
        else:
            self.score = scoreDoc.score

        self.hits = []

        stored = searcher.doc(scoreDoc.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = int(header_span) if header_span is not None else 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDoc.score, {'how_found': how_found, 'snippets': snippets})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        # drop sections already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        """Merges several lists of SearchResults into one result per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


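# A sketch of combining result lists from different strategies (assumes
# `search` is a MultiSearch instance):
#
#     perfect = search.search_perfect_book(u"lalka")
#     parts = search.search_perfect_parts(u"lalka")
#     for r in sorted(SearchResult.aggregate(perfect, parts), reverse=True):
#         print(r.book, r.process_hits())
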
class Hint(object):
    def __init__(self, search):
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        self._books = books

    def tags(self, tags):
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return MultiSearch.chain_filters(fs)

    def should_search_for_book(self):
        return not self._books

    def just_search_in(self, all_fields):
        """Decides which of the given fields are still worth searching,
        given the hints we already have."""
        some = []
        for field in all_fields:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


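# A sketch of narrowing a search with hints (assumes `search` is a
# MultiSearch and `author_tag` is a Tag with category 'author'):
#
#     hint = search.hint()
#     hint.tags([author_tag])
#     books = search.search_perfect_book(u"ogniem i mieczem", hint=hint)
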
class MultiSearch(Search):
    """Class capable of IMDb-like searching."""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be a StringReader, a string/unicode, or a token list;
        in the last case the tokens are returned as-is (so tokens can be
        reused as long as the analyzer does not change).
        """
        if isinstance(searched, (str, unicode)):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        """Normalizes the fuzzy argument to a minimum similarity in (0, 1]."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(in_content, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy (only sketched in the comments below; the method
        currently returns None):
        - (phrase) OR -> content
                      -> title
                      -> authors
        - (keywords)  -> authors
                      -> motyw
                      -> tags
                      -> content
        """
        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None

    def book_search(self, query, filter=None, max_results=50, collector=None):
        tops = self.searcher.search(query, filter, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print("%s (%d) -> %f" % (b, b.id, found.score))
        return bks

    def get_snippets(self, scoreDoc, query, field='content'):
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate the content in the snippets file
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return [snip]

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def search_books(self, query, filter=None, max_results=10):
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        """Builds a MultiPhraseQuery whose last token is expanded by prefix."""
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = MultiSearch.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.book_search(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)
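

# A sketch of the autocomplete flow (assumes a populated index):
#
#     search = MultiSearch()
#     print(search.hint_books(u"pan t"))   # title prefix -> list of Books
#     print(search.hint_tags(u"romant"))   # tag-name prefix -> list of Tags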