check for empty source in dc
[wolnelektury.git] / apps/search/index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5     File, Field, Integer, \
6     NumericField, Version, Document, JavaError, IndexSearcher, \
7     QueryParser, PerFieldAnalyzerWrapper, \
8     SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9     KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10     BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11     HashSet, BooleanClause, Term, CharTermAttribute, \
12     PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13     FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14     SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15     BooleanFilter, FilterClause, QueryWrapperFilter, \
16     initVM, CLASSPATH, JArray
17     # KeywordAnalyzer
18
19 # Initialize jvm
20 JVM = initVM(CLASSPATH)
21
22 import sys
23 import os
24 import re
25 import errno
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
33 import atexit
34 import traceback
35
36
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
38     def __init__(self):
39         polish = PolishAnalyzer(Version.LUCENE_34)
40         #        polish_gap.setPositionIncrementGap(999)
41
42         simple = SimpleAnalyzer(Version.LUCENE_34)
43         #        simple_gap.setPositionIncrementGap(999)
44
45         keyword = KeywordAnalyzer(Version.LUCENE_34)
46
47         # not sure if needed: there's NOT_ANALYZED meaning basically the same
48
49         PerFieldAnalyzerWrapper.__init__(self, polish)
50
51         self.addAnalyzer("tags", simple)
52         self.addAnalyzer("technical_editors", simple)
53         self.addAnalyzer("editors", simple)
54         self.addAnalyzer("url", keyword)
55         self.addAnalyzer("source_url", keyword)
56         self.addAnalyzer("source_name", simple)
57         self.addAnalyzer("publisher", simple)
58         self.addAnalyzer("authors", simple)
59         self.addAnalyzer("title", simple)
60
61         self.addAnalyzer("is_book", keyword)
62         # shouldn't the title have two forms? _pl and simple?
63
64         self.addAnalyzer("themes", simple)
65         self.addAnalyzer("themes_pl", polish)
66
67         self.addAnalyzer("tag_name", simple)
68         self.addAnalyzer("tag_name_pl", polish)
69
70         self.addAnalyzer("translators", simple)
71
72         self.addAnalyzer("KEYWORD", keyword)
73         self.addAnalyzer("SIMPLE", simple)
74         self.addAnalyzer("POLISH", polish)
75
76
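# Illustrative sketch (an addition for clarity, not part of the original module):
# fields registered above are routed to their own analyzer; any field not listed
# falls back to the default PolishAnalyzer passed to PerFieldAnalyzerWrapper.__init__.
def _example_analyze(field, text):
    analyzer = WLAnalyzer()
    stream = analyzer.reusableTokenStream(field, StringReader(text))
    tokens = []
    while stream.incrementToken():
        tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
    return tokens  # e.g. _example_analyze("tags", u"Epika, Liryka")
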
77 class IndexStore(object):
78     """
79     Provides access to search index.
80
81     self.store - lucene index directory
82     """
83     def __init__(self):
84         self.make_index_dir()
85         self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
86
87     def make_index_dir(self):
88         try:
89             os.makedirs(settings.SEARCH_INDEX)
90         except OSError as exc:
91             if exc.errno == errno.EEXIST:
92                 pass
93             else: raise
94
95
96 class IndexChecker(IndexStore):
97     def __init__(self):
98         IndexStore.__init__(self)
99
100     def check(self):
101         checker = CheckIndex(self.store)
102         status = checker.checkIndex()
103         return status
104
105
106 class Snippets(object):
107     """
108     This class manages the snippet file for an indexed object (book).
109     Snippets are concatenated together; their positions and
110     lengths are kept in lucene index fields.
111     """
112     SNIPPET_DIR = "snippets"
113
114     def __init__(self, book_id):
115         try:
116             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117         except OSError as exc:
118             if exc.errno == errno.EEXIST:
119                 pass
120             else: raise
121         self.book_id = book_id
122         self.file = None
123
124     def open(self, mode='r'):
125         """
126         Open the snippet file. Call .close() afterwards.
127         """
128         if 'b' not in mode:
129             mode += 'b'
130         self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
131         self.position = 0
132         return self
133
134     def add(self, snippet):
135         """
136         Append a snippet (unicode) to the snippet file.
137         Return a (position, length) tuple
138         """
139         txt = snippet.encode('utf-8')
140         l = len(txt)
141         self.file.write(txt)
142         pos = (self.position, l)
143         self.position += l
144         return pos
145
146     def get(self, pos):
147         """
148         Given a (position, length) tuple, return the snippet
149         stored there as unicode.
150         """
151         self.file.seek(pos[0], 0)
152         txt = self.file.read(pos[1]).decode('utf-8')
153         return txt
154
155     def close(self):
156         """Close snippet file"""
157         self.file.close()
158
159
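# Illustrative sketch (not part of the original module): the snippet file
# round-trips text. The (position, length) pair returned by add() is what
# index_content() stores in the "snippets_position"/"snippets_length" fields,
# and get() reads the snippet back from it.
def _example_snippets_roundtrip(book_id):
    snippets = Snippets(book_id).open('w')
    try:
        pos = snippets.add(u"Litwo! Ojczyzno moja!")  # -> (offset, byte length)
    finally:
        snippets.close()
    snippets = Snippets(book_id).open()
    try:
        return snippets.get(pos)  # -> u"Litwo! Ojczyzno moja!"
    finally:
        snippets.close()
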
160 class BaseIndex(IndexStore):
161     """
162     Base index class.
163     Provides basic operations on index: opening, closing, optimizing.
164     """
165     def __init__(self, analyzer=None):
166         super(BaseIndex, self).__init__()
167         self.index = None
168         if not analyzer:
169             analyzer = WLAnalyzer()
170         self.analyzer = analyzer
171
172     def open(self, analyzer=None):
173         if self.index:
174             raise Exception("Index is already opened")
175         self.index = IndexWriter(self.store, self.analyzer,\
176                                  IndexWriter.MaxFieldLength.LIMITED)
177         return self.index
178
179     def optimize(self):
180         self.index.optimize()
181
182     def close(self):
183         try:
184             self.index.optimize()
185         except JavaError as je:
186             print("Error during optimize phase, check index: %s" % je)
187
188         self.index.close()
189         self.index = None
190
191     def __enter__(self):
192         self.open()
193         return self
194
195     def __exit__(self, type, value, tb):
196         self.close()
197
198
199 class Index(BaseIndex):
200     """
201     Class indexing books.
202     """
203     def __init__(self, analyzer=None):
204         super(Index, self).__init__(analyzer)
205
206     def index_tags(self):
207         """
208         Re-index the global tag list.
209         Removes all tags from the index, then indexes them again.
210         Indexed fields include: id, name (with and without Polish stems), category
211         """
212         q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213         self.index.deleteDocuments(q)
214
215         for tag in catalogue.models.Tag.objects.all():
216             doc = Document()
217             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218             doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219             doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220             doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221             self.index.addDocument(doc)
222
223         for pdtag in PDCounterAuthor.objects.all():
224             doc = Document()
225             doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226             doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227             doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228             doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229             doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230             self.index.addDocument(doc)
231
232     def create_book_doc(self, book):
233         """
234         Create a lucene document referring to the book id.
235         """
236         doc = Document()
237         doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238         if book.parent is not None:
239             doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
240         return doc
241
242     def remove_book(self, book):
243         """Removes a book from search index.
244         book - Book instance."""
245         q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246         self.index.deleteDocuments(q)
247
248     def index_book(self, book, book_info=None, overwrite=True):
249         """
250         Indexes the book.
251         Creates a lucene document for extracted metadata
252         and calls self.index_content() to index the contents of the book.
253         """
254         if overwrite:
255             self.remove_book(book)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info)
259         for f in meta_fields.values():
260             if isinstance(f, list) or isinstance(f, tuple):
261                 for elem in f:
262                     book_doc.add(elem)
263             else:
264                 book_doc.add(f)
265
266         self.index.addDocument(book_doc)
267         del book_doc
268
269         self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
270
271     master_tags = [
272         'opowiadanie',
273         'powiesc',
274         'dramat_wierszowany_l',
275         'dramat_wierszowany_lp',
276         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
277         'wywiad',
278         ]
279
280     ignore_content_tags = [
281         'uwaga', 'extra',
282         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
283         'didaskalia',
284         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
285         ]
286
287     footnote_tags = ['pa', 'pt', 'pr', 'pe']
288
289     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
290
291     published_date_re = re.compile("([0-9]+)[\]. ]*$")
292
293     def extract_metadata(self, book, book_info=None):
294         """
295         Extracts metadata from the book and returns a map of fields keyed by field name.
296         """
297         fields = {}
298
299         if book_info is None:
300             book_info = dcparser.parse(open(book.xml_file.path))
301
302         fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303         fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304         fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
305
306         # validator, name
307         for field in dcparser.BookInfo.FIELDS:
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     try:
318                         fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319                     except JavaError as je:
320                         raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321                 elif type_indicator == dcparser.as_person:
322                     p = getattr(book_info, field.name)
323                     if isinstance(p, dcparser.Person):
324                         persons = unicode(p)
325                     else:
326                         persons = ', '.join(map(unicode, p))
327                     fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328                 elif type_indicator == dcparser.as_date:
329                     dt = getattr(book_info, field.name)
330                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
332
333         # get published date; guard against a missing or empty source in Dublin Core
334         source = getattr(book_info, 'source_name', None)
335         if source:
336             match = self.published_date_re.search(source)
337             if match is not None:
338                 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
339
340         return fields
341
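    # Illustrative sketch (hypothetical source strings, not part of the original
    # class): published_date_re picks the trailing year out of the Dublin Core
    # source string, which is what the guarded block above stores as "published_date".
    def _example_published_date(self):
        assert self.published_date_re.search(u"Czytelnik, Warszawa 1990").groups()[0] == "1990"
        assert self.published_date_re.search(u"PIW, Warszawa [1991].").groups()[0] == "1991"
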
342     def add_gaps(self, fields, fieldname):
343         """
344         Interleaves a list of fields with gap fields (indexed single spaces) and returns the result.
345         This keeps phrase queries from matching across adjacent values (when slop is 0).
346         """
347         def gap():
348             while True:
349                 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
350         return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
351
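    # Illustrative sketch (not part of the original class): for three tag values
    # add_gaps() returns [tag, gap, tag, gap, tag]; the single-space gap fields
    # keep a slop-0 phrase query from matching across two adjacent values.
    def _example_add_gaps(self):
        tags = [Field("tags", name, Field.Store.NO, Field.Index.ANALYZED)
                for name in [u"Epika", u"Liryka", u"Dramat"]]
        return self.add_gaps(tags, 'tags')  # 5 fields: 3 values, 2 gaps
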
352     def get_master(self, root):
353         """
354         Returns the first master tag from an etree.
355         """
356         for master in root.iter():
357             if master.tag in self.master_tags:
358                 return master
359
360     def index_content(self, book, book_fields=[]):
361         """
362         Walks the book XML and extracts content from it.
363         Adds parts for each header tag and for each fragment.
364         """
365         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
366         root = wld.edoc.getroot()
367
368         master = self.get_master(root)
369         if master is None:
370             return []
371
372         def walker(node, ignore_tags=[]):
373             yield node, None
374             for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
375                 for b, e in walker(child):
376                     yield b, e
377             yield None, node
378             return
379
380         def fix_format(text):
381             #            separator = [u" ", u"\t", u".", u";", u","]
382             if isinstance(text, list):
383                 # need to join it first
384                 text = filter(lambda s: s is not None, text)
385                 text = u' '.join(text)
386                 # for i in range(len(text)):
387                 #     if i > 0:
388                 #         if text[i][0] not in separator\
389                 #             and text[i - 1][-1] not in separator:
390                 #          text.insert(i, u" ")
391
392             return re.sub("(?m)/$", "", text)
393
394         def add_part(snippets, **fields):
395             doc = self.create_book_doc(book)
396             for f in book_fields:
397                 doc.add(f)
398
399             doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
400             doc.add(NumericField("header_span", Field.Store.YES, True)\
401                     .setIntValue('header_span' in fields and fields['header_span'] or 1))
402             doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
403
404             doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
405                           Field.TermVector.WITH_POSITIONS_OFFSETS))
406
407             snip_pos = snippets.add(fields["content"])
408             doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
409             doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
410
411             if 'fragment_anchor' in fields:
412                 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
413                               Field.Store.YES, Field.Index.NOT_ANALYZED))
414
415             if 'themes' in fields:
416                 themes, themes_pl = zip(*[
417                     (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
418                      Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
419                      for theme in fields['themes']])
420
421                 themes = self.add_gaps(themes, 'themes')
422                 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
423
424                 for t in themes:
425                     doc.add(t)
426                 for t in themes_pl:
427                     doc.add(t)
428
429             return doc
430
431         def give_me_utf8(s):
432             if isinstance(s, unicode):
433                 return s.encode('utf-8')
434             else:
435                 return s
436
437         fragments = {}
438         snippets = Snippets(book.id).open('w')
439         try:
440             for header, position in zip(list(master), range(len(master))):
441
442                 if header.tag in self.skip_header_tags:
443                     continue
444                 if header.tag is etree.Comment:
445                     continue
446
447                 # section content
448                 content = []
449                 footnote = None
450
451                 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
452                     # handle footnotes
453                     # if start is not None and start.tag in self.footnote_tags:
454                     #     footnote = ' '.join(start.itertext())
455                     # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
456                     #     doc = add_part(snippets, header_index=position, header_type=header.tag,
457                     #                    content=footnote)
458
459                     #     self.index.addDocument(doc)
460
461                     #     footnote = None
462
463                     # handle fragments and themes.
464                     if start is not None and start.tag == 'begin':
465                         fid = start.attrib['id'][1:]
466                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
467
468                     elif start is not None and start.tag == 'motyw':
469                         fid = start.attrib['id'][1:]
470                         if start.text is not None:
471                             fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
472
473                     elif start is not None and start.tag == 'end':
474                         fid = start.attrib['id'][1:]
475                         if fid not in fragments:
476                             continue  # a broken <end> node, skip it
477                                       #                        import pdb; pdb.set_trace()
478                         frag = fragments[fid]
479                         if frag['themes'] == []:
480                             continue  # empty themes list.
481                         del fragments[fid]
482
483                         doc = add_part(snippets,
484                                        header_type=frag['start_header'],
485                                        header_index=frag['start_section'],
486                                        header_span=position - frag['start_section'] + 1,
487                                        fragment_anchor=fid,
488                                        content=fix_format(frag['content']),
489                                        themes=frag['themes'])
490
491                         self.index.addDocument(doc)
492
493                         # Collect content.
494                     elif start is not None:
495                         for frag in fragments.values():
496                             frag['content'].append(start.text)
497                         content.append(start.text)
498                     elif end is not None:
499                         for frag in fragments.values():
500                             frag['content'].append(end.tail)
501                         content.append(end.tail)
502
503                         # in the end, add a section text.
504                 doc = add_part(snippets, header_index=position, header_type=header.tag,
505                                content=fix_format(content))
506
507                 self.index.addDocument(doc)
508
509         finally:
510             snippets.close()
511
512
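# Illustrative sketch (assumes the usual catalogue.models.Book fields, not part of
# the original module): indexing a single book using the context-manager protocol
# defined on BaseIndex.
def _example_index_book(slug):
    book = catalogue.models.Book.objects.get(slug=slug)
    with Index() as index:       # opens an IndexWriter, optimizes and closes it on exit
        index.index_book(book)   # one metadata document plus a document per section/fragment
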
513 def log_exception_wrapper(f):
514     def _wrap(*a):
515         try:
516             f(*a)
517         except Exception as e:
518             print("Error in indexing thread: %s" % e)
519             traceback.print_exc()
520             raise e
521     return _wrap
522
523
524 class ReusableIndex(Index):
525     """
526     Works like Index, but does not close/optimize the Lucene index
527     until program exit (uses an atexit hook).
528     This is useful for the importbooks command.
529
530     If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
531     """
532     index = None
533
534     def open(self, analyzer=None, threads=4):
535         if ReusableIndex.index is not None:
536             self.index = ReusableIndex.index
537         else:
538             print("opening index")
539             Index.open(self, analyzer)
540             ReusableIndex.index = self.index
541             atexit.register(ReusableIndex.close_reusable)
542
543     # def index_book(self, *args, **kw):
544     #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
545     #     ReusableIndex.pool_jobs.append(job)
546
547     @staticmethod
548     def close_reusable():
549         if ReusableIndex.index is not None:
550             ReusableIndex.index.optimize()
551             ReusableIndex.index.close()
552             ReusableIndex.index = None
553
554     def close(self):
555         pass
556
557
558 class JoinSearch(object):
559     """
560     This mixin could be used to handle block join queries.
561     (currently unused)
562     """
563     def __init__(self, *args, **kw):
564         super(JoinSearch, self).__init__(*args, **kw)
565
566     def wrapjoins(self, query, fields=[]):
567         """
568         This function modifies the query recursively, so that contained
569         Term and Phrase queries which match the provided fields
570         are wrapped in a BlockJoinQuery and thereby delegated
571         to child documents.
572         """
573         if BooleanQuery.instance_(query):
574             qs = BooleanQuery.cast_(query)
575             for clause in qs:
576                 clause = BooleanClause.cast_(clause)
577                 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
578             return qs
579         else:
580             termset = HashSet()
581             query.extractTerms(termset)
582             for t in termset:
583                 t = Term.cast_(t)
584                 if t.field() not in fields:
585                     return query
586             return BlockJoinQuery(query, self.parent_filter,
587                                   BlockJoinQuery.ScoreMode.Total)
588
589     def bsearch(self, query, max_results=50):
590         q = self.query(query)
591         bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
592
593         tops = self.searcher.search(bjq, max_results)
594         bks = []
595         for found in tops.scoreDocs:
596             doc = self.searcher.doc(found.doc)
597             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
598         return (bks, tops.totalHits)
599
600
601 class SearchResult(object):
602     def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
603         if tokens_cache is None: tokens_cache = {}
604
605         if score:
606             self._score = score
607         else:
608             self._score = scoreDocs.score
609
610         self.boost = 1.0
611
612         self._hits = []
613         self._processed_hits = None  # processed hits
614
615         stored = search.searcher.doc(scoreDocs.doc)
616         self.book_id = int(stored.get("book_id"))
617
618         pd = stored.get("published_date")
619         if pd is None:
620             pd = 0
621         self.published_date = int(pd)
622
623         header_type = stored.get("header_type")
624         # we have a content hit in some header or fragment
625         if header_type is not None:
626             sec = (header_type, int(stored.get("header_index")))
627             header_span = stored.get('header_span')
628             header_span = header_span is not None and int(header_span) or 1
629
630             fragment = stored.get("fragment_anchor")
631
632             if snippets:
633                 snippets = snippets.replace("/\n", "\n")
634             hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
635
636             self._hits.append(hit)
637
638         self.search = search
639         self.searched = searched
640         self.tokens_cache = tokens_cache
641
642     @property
643     def score(self):
644         return self._score * self.boost
645
646     def merge(self, other):
647         if self.book_id != other.book_id:
648             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
649         self._hits += other._hits
650         if other.score > self.score:
651             self._score = other._score
652         return self
653
654     def get_book(self):
655         return catalogue.models.Book.objects.get(id=self.book_id)
656
657     book = property(get_book)
658
659     @property
660     def hits(self):
661         if self._processed_hits is not None:
662             return self._processed_hits
663
664         POSITION = 0
665         FRAGMENT = 1
666         POSITION_INDEX = 1
667         POSITION_SPAN = 2
668         SCORE = 2
669         OTHER = 3
670
671         # to sections and fragments
672         frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
673         sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
674         sect = filter(lambda s: 0 == len(filter(
675             lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
676             and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
677             frags)), sect)
678
679         hits = []
680
681         # remove duplicate fragments
682         fragments = {}
683         for f in frags:
684             fid = f[FRAGMENT]
685             if fid in fragments:
686                 if fragments[fid][SCORE] >= f[SCORE]:
687                     continue
688             fragments[fid] = f
689         frags = fragments.values()
690
691         # remove duplicate sections
692         sections = {}
693
694         for s in sect:
695             si = s[POSITION][POSITION_INDEX]
696             # skip existing
697             if si in sections:
698                 if sections[si]['score'] >= s[SCORE]:
699                     continue
700
701             m = {'score': s[SCORE],
702                  'section_number': s[POSITION][POSITION_INDEX] + 1,
703                  }
704             m.update(s[OTHER])
705             sections[si] = m
706
707         hits = sections.values()
708
709         for f in frags:
710             try:
711                 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
712             except catalogue.models.Fragment.DoesNotExist:
713                 # stale index
714                 continue
715
716             # Figure out if we were searching for a token matching some word in theme name.
717             themes = frag.tags.filter(category='theme')
718             themes_hit = []
719             if self.searched is not None:
720                 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
721                 for theme in themes:
722                     name_tokens = self.search.get_tokens(theme.name, 'POLISH')
723                     for t in tokens:
724                         if t in name_tokens:
725                             if not theme in themes_hit:
726                                 themes_hit.append(theme)
727                             break
728
729             m = {'score': f[SCORE],
730                  'fragment': frag,
731                  'section_number': f[POSITION][POSITION_INDEX] + 1,
732                  'themes': themes,
733                  'themes_hit': themes_hit
734                  }
735             m.update(f[OTHER])
736             hits.append(m)
737
738         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
739
740         self._processed_hits = hits
741
742         return hits
743
744     def __unicode__(self):
745         return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
746
747     @staticmethod
748     def aggregate(*result_lists):
749         books = {}
750         for rl in result_lists:
751             for r in rl:
752                 if r.book_id in books:
753                     books[r.book_id].merge(r)
754                     #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
755                 else:
756                     books[r.book_id] = r
757         return books.values()
758
759     def __cmp__(self, other):
760         c = cmp(self.score, other.score)
761         if c == 0:
762             if not hasattr(other,'published_date') or not hasattr(self, 'published_date'):
763                 import pdb; pdb.set_trace()
764             # this is inverted, because earlier date is better
765             return cmp(other.published_date, self.published_date)
766         else:
767             return c
768
769
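# Illustrative sketch (not part of the original module): combining results from
# several search strategies. SearchResult.aggregate() merges hits referring to the
# same book, and __cmp__ orders by score, then by earlier published_date.
def _example_aggregate(search, query):
    results = SearchResult.aggregate(
        search.search_perfect_book(query),
        search.search_perfect_parts(query),
        search.search_everywhere(query))
    return sorted(results, reverse=True)  # best-scoring book first
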
770 class Hint(object):
771     """
772     Given some hint information about our search target - things we already
773     know, like author, title (a specific book), epoch, genre or kind -
774     we can narrow the search down using filters.
775     """
776     def __init__(self, search):
777         """
778         Accepts a Searcher instance.
779         """
780         self.search = search
781         self.book_tags = {}
782         self.part_tags = []
783         self._books = []
784
785     def books(self, *books):
786         """
787         Give a hint that we search these books.
788         """
789         self._books = books
790
791     def tags(self, tags):
792         """
793         Give a hint that these Tag objects (a list)
794         are required.
795         """
796         for t in tags:
797             if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
798                 lst = self.book_tags.get(t.category, [])
799                 lst.append(t)
800                 self.book_tags[t.category] = lst
801             if t.category in ['theme', 'theme_pl']:
802                 self.part_tags.append(t)
803
804     def tag_filter(self, tags, field='tags'):
805         """
806         Given a list of tags and an optional field (they normally live in the 'tags' field),
807         returns a filter accepting only books with these specific tags.
808         """
809         q = BooleanQuery()
810
811         for tag in tags:
812             toks = self.search.get_tokens(tag.name, field=field)
813             tag_phrase = PhraseQuery()
814             for tok in toks:
815                 tag_phrase.add(Term(field, tok))
816             q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
817
818         return QueryWrapperFilter(q)
819
820     def book_filter(self):
821         """
822         Filters using book tags (all tag categories except theme)
823         """
824         tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
825         if tags:
826             return self.tag_filter(tags)
827         else:
828             return None
829
830     def part_filter(self):
831         """
832         This filter can be used to look for book parts.
833         It filters on book id and/or themes.
834         """
835         fs = []
836         if self.part_tags:
837             fs.append(self.tag_filter(self.part_tags, field='themes'))
838
839         if self._books != []:
840             bf = BooleanFilter()
841             for b in self._books:
842                 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
843                 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
844             fs.append(bf)
845
846         return Search.chain_filters(fs)
847
848     def should_search_for_book(self):
849         return self._books == []
850
851     def just_search_in(self, all):
852         """Holds the logic to figure out which indexes should be searched when we already have some hints."""
853         some = []
854         for field in all:
855             if field == 'authors' and 'author' in self.book_tags:
856                 continue
857             if field == 'title' and self._books != []:
858                 continue
859             if (field == 'themes' or field == 'themes_pl') and self.part_tags:
860                 continue
861             some.append(field)
862         return some
863
864
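# Illustrative sketch (hypothetical tag object, not part of the original module):
# narrowing a search with a Hint built from tags we already know about.
def _example_hint_search(search, author_tag):
    hint = search.hint()        # same as Hint(search)
    hint.tags([author_tag])     # e.g. a Tag with category='author'
    return search.search_perfect_book(u"pan tadeusz", hint=hint)
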
865 class Search(IndexStore):
866     """
867     Search facilities.
868     """
869     def __init__(self, default_field="content"):
870         IndexStore.__init__(self)
871         self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
872         # self.analyzer = WLAnalyzer()
873         self.searcher = IndexSearcher(self.store, True)
874         self.parser = QueryParser(Version.LUCENE_34, default_field,
875                                   self.analyzer)
876
877         self.parent_filter = TermsFilter()
878         self.parent_filter.addTerm(Term("is_book", "true"))
879
880     def query(self, query):
881         """Parse query in default Lucene Syntax. (for humans)
882         """
883         return self.parser.parse(query)
884
885     def simple_search(self, query, max_results=50):
886         """Runs a query for books using lucene syntax. (for humans)
887         Returns (books, total_hits)
888         """
889
890         tops = self.searcher.search(self.query(query), max_results)
891         bks = []
892         for found in tops.scoreDocs:
893             doc = self.searcher.doc(found.doc)
894             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
895         return (bks, tops.totalHits)
896
897     def get_tokens(self, searched, field='content', cached=None):
898         """Returns tokens analyzed by the analyzer appropriate for the given field.
899         The argument can be a StringReader, a string/unicode, or a list of tokens. In the last case
900         the tokens are returned as-is (so we can reuse tokens if we don't change the analyzer).
901         """
902         if cached is not None and field in cached:
903             return cached[field]
904
905         if isinstance(searched, str) or isinstance(searched, unicode):
906             searched = StringReader(searched)
907         elif isinstance(searched, list):
908             return searched
909
910         searched.reset()
911         tokens = self.analyzer.reusableTokenStream(field, searched)
912         toks = []
913         while tokens.incrementToken():
914             cta = tokens.getAttribute(CharTermAttribute.class_)
915             toks.append(cta.toString())
916
917         if cached is not None:
918             cached[field] = toks
919
920         return toks
921
922     def fuzziness(self, fuzzy):
923         """Helper method to sanitize fuzziness"""
924         if not fuzzy:
925             return None
926         if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
927             return fuzzy
928         else:
929             return 0.5
930
931     def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
932         """
933         Return a PhraseQuery with a series of tokens.
934         """
935         if fuzzy:
936             phrase = MultiPhraseQuery()
937             for t in tokens:
938                 term = Term(field, t)
939                 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
940                 fuzzterms = []
941
942                 while True:
943                     #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
944                     ft = fuzzterm.term()
945                     if ft:
946                         fuzzterms.append(ft)
947                     if not fuzzterm.next(): break
948                 if fuzzterms:
949                     phrase.add(JArray('object')(fuzzterms, Term))
950                 else:
951                     phrase.add(term)
952         else:
953             phrase = PhraseQuery()
954             phrase.setSlop(slop)
955             for t in tokens:
956                 term = Term(field, t)
957                 phrase.add(term)
958         return phrase
959
960     def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
961         """
962         Returns term queries joined into a boolean query.
963         modal - occurrence modifier applied to each boolean clause
964         fuzzy - should the query be fuzzy.
965         """
966         q = BooleanQuery()
967         for t in tokens:
968             term = Term(field, t)
969             if fuzzy:
970                 term = FuzzyQuery(term, self.fuzziness(fuzzy))
971             else:
972                 term = TermQuery(term)
973             q.add(BooleanClause(term, modal))
974         return q
975
976     def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
977                       filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
978         if filters is None: filters = []
979         if tokens_cache is None: tokens_cache = {}
980
981         tokens = self.get_tokens(searched, field, cached=tokens_cache)
982
983         query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
984         if book:
985             filters.append(self.term_filter(Term('is_book', 'true')))
986         top = self.searcher.search(query, self.chain_filters(filters), max_results)
987
988         return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
989
990     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
991                     filters=None, tokens_cache=None, boost=None, snippets=True):
992         if filters is None: filters = []
993         if tokens_cache is None: tokens_cache = {}
994
995         if book:
996             filters.append(self.term_filter(Term('is_book', 'true')))
997
998         query = BooleanQuery()
999
1000         for fld in fields:
1001             tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1002
1003             query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1004                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1005
1006         top = self.searcher.search(query, self.chain_filters(filters), max_results)
1007
1008         return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1009                              snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1010
1011     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1012         """
1013         Search for perfect book matches. Just see if the query matches with some author or title,
1014         taking hints into account.
1015         """
1016         fields_to_search = ['authors', 'title']
1017         only_in = None
1018         if hint:
1019             if not hint.should_search_for_book():
1020                 return []
1021             fields_to_search = hint.just_search_in(fields_to_search)
1022             only_in = hint.book_filter()
1023
1024         qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1025
1026         books = []
1027         for q in qrys:
1028             top = self.searcher.search(q,
1029                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1030                 max_results)
1031             for found in top.scoreDocs:
1032                 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1033         return books
1034
1035     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1036         fields_to_search = ['tags', 'authors', 'title']
1037
1038         only_in = None
1039         if hint:
1040             if not hint.should_search_for_book():
1041                 return []
1042             fields_to_search = hint.just_search_in(fields_to_search)
1043             only_in = hint.book_filter()
1044
1045         tokens = self.get_tokens(searched, field='SIMPLE')
1046
1047         q = BooleanQuery()
1048
1049         for fld in fields_to_search:
1050             q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1051                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1052
1053         books = []
1054         top = self.searcher.search(q,
1055                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1056             max_results)
1057         for found in top.scoreDocs:
1058             books.append(SearchResult(self, found, how_found="search_book"))
1059
1060         return books
1061
1062     def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1063         """
1064         Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1065         some part/fragment of the book.
1066         """
1067         qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1068
1069         flt = None
1070         if hint:
1071             flt = hint.part_filter()
1072
1073         books = []
1074         for q in qrys:
1075             top = self.searcher.search(q,
1076                                        self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1077                                                            flt]),
1078                                        max_results)
1079             for found in top.scoreDocs:
1080                 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1081
1082         return books
1083
1084     def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1085         """
1086         Tries to use search terms to match different fields of the book (or its parts).
1087         E.g. one word can be an author's name, another a part of the title, and the rest
1088         some words from the third chapter.
1089         """
1090         if tokens_cache is None: tokens_cache = {}
1091         books = []
1092         only_in = None
1093
1094         if hint:
1095             only_in = hint.part_filter()
1096
1097         # content only query : themes x content
1098         q = BooleanQuery()
1099
1100         tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1101         tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1102
1103         # only search in themes when we do not already filter by themes
1104         if hint is None or hint.just_search_in(['themes']) != []:
1105             q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1106                                                      fuzzy=fuzzy), BooleanClause.Occur.MUST))
1107
1108         q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1109                                                  fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1110
1111         topDocs = self.searcher.search(q, only_in, max_results)
1112         for found in topDocs.scoreDocs:
1113             books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1114             print "* %s theme x content: %s" % (searched, books[-1]._hits)
1115
1116         # query themes/content x author/title/tags
1117         q = BooleanQuery()
1118         in_content = BooleanQuery()
1119         in_meta = BooleanQuery()
1120
1121         for fld in ['themes_pl', 'content']:
1122             in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1123
1124         for fld in ['tags', 'authors', 'title']:
1125             in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1126
1127         q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1128         q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1129
1130         topDocs = self.searcher.search(q, only_in, max_results)
1131         for found in topDocs.scoreDocs:
1132             books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1133             print "* %s scatter search: %s" % (searched, books[-1]._hits)
1134
1135         return books
1136
1137     # def multisearch(self, query, max_results=50):
1138     #     """
1139     #     Search strategy:
1140     #     - (phrase) OR -> content
1141     #                   -> title
1142     #                   -> authors
1143     #     - (keywords)  -> authors
1144     #                   -> motyw
1145     #                   -> tags
1146     #                   -> content
1147     #     """
1148         # queryreader = StringReader(query)
1149         # tokens = self.get_tokens(queryreader)
1150
1151         # top_level = BooleanQuery()
1152         # Should = BooleanClause.Occur.SHOULD
1153
1154         # phrase_level = BooleanQuery()
1155         # phrase_level.setBoost(1.3)
1156
1157         # p_content = self.make_phrase(tokens, joined=True)
1158         # p_title = self.make_phrase(tokens, 'title')
1159         # p_author = self.make_phrase(tokens, 'author')
1160
1161         # phrase_level.add(BooleanClause(p_content, Should))
1162         # phrase_level.add(BooleanClause(p_title, Should))
1163         # phrase_level.add(BooleanClause(p_author, Should))
1164
1165         # kw_level = BooleanQuery()
1166
1167         # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1168         # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1169         # kw_level.add(j_themes, Should)
1170         # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1171         # j_con = self.make_term_query(tokens, joined=True)
1172         # kw_level.add(j_con, Should)
1173
1174         # top_level.add(BooleanClause(phrase_level, Should))
1175         # top_level.add(BooleanClause(kw_level, Should))
1176
1177         # return None
1178
1179     def get_snippets(self, scoreDoc, query, field='content'):
1180         """
1181         Returns a snippet for found scoreDoc.
1182         """
1183         htmlFormatter = SimpleHTMLFormatter()
1184         highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1185
1186         stored = self.searcher.doc(scoreDoc.doc)
1187
1188         position = stored.get('snippets_position')
1189         length = stored.get('snippets_length')
1190         if position is None or length is None:
1191             return None
1192         # locate content.
1193         snippets = Snippets(stored.get('book_id')).open()
1194         try:
1195             text = snippets.get((int(position),
1196                                  int(length)))
1197         finally:
1198             snippets.close()
1199
1200         tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1201         #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
1202         snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1203
1204         return snip
1205
1206     @staticmethod
1207     def enum_to_array(enum):
1208         """
1209         Converts a lucene TermEnum to array of Terms, suitable for
1210         addition to queries
1211         """
1212         terms = []
1213
1214         while True:
1215             t = enum.term()
1216             if t:
1217                 terms.append(t)
1218             if not enum.next(): break
1219
1220         if terms:
1221             return JArray('object')(terms, Term)
1222
1223     def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1224         """
1225         Search for Tag objects using query.
1226         """
1227         if not pdcounter:
1228             filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1229         tops = self.searcher.search(query, filters, max_results)
1230
1231         tags = []
1232         for found in tops.scoreDocs:
1233             doc = self.searcher.doc(found.doc)
1234             is_pdcounter = doc.get('is_pdcounter')
1235             if is_pdcounter:
1236                 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1237             else:
1238                 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1239                 # don't add the pdcounter tag if same tag already exists
1240             if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1241                 tags.append(tag)
1242                 #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
1243         print 'returning %s' % tags
1244         return tags
1245
1246     def search_books(self, query, filter=None, max_results=10):
1247         """
1248         Searches for Book objects using query
1249         """
1250         bks = []
1251         tops = self.searcher.search(query, filter, max_results)
1252         for found in tops.scoreDocs:
1253             doc = self.searcher.doc(found.doc)
1254             bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1255         return bks
1256
1257     def make_prefix_phrase(self, toks, field):
1258         q = MultiPhraseQuery()
1259         for i in range(len(toks)):
1260             t = Term(field, toks[i])
1261             if i == len(toks) - 1:
1262                 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1263                 if pterms:
1264                     q.add(pterms)
1265                 else:
1266                     q.add(t)
1267             else:
1268                 q.add(t)
1269         return q
1270
1271     @staticmethod
1272     def term_filter(term, inverse=False):
1273         only_term = TermsFilter()
1274         only_term.addTerm(term)
1275
1276         if inverse:
1277             neg = BooleanFilter()
1278             neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1279             only_term = neg
1280
1281         return only_term
1282
1283     def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1284         """
1285         Return auto-complete hints for tags
1286         using prefix search.
1287         """
1288         toks = self.get_tokens(string, field='SIMPLE')
1289         top = BooleanQuery()
1290
1291         for field in ['tag_name', 'tag_name_pl']:
1292             if prefix:
1293                 q = self.make_prefix_phrase(toks, field)
1294             else:
1295                 q = self.make_term_query(toks, field)
1296             top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1297
1298         no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1299
1300         return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1301
1302     def hint_books(self, string, max_results=50, prefix=True):
1303         """
1304         Returns auto-complete hints for book titles
1305         (we do not index 'pseudo' title-tags).
1306         Uses prefix search.
1307         """
1308         toks = self.get_tokens(string, field='SIMPLE')
1309
1310         if prefix:
1311             q = self.make_prefix_phrase(toks, 'title')
1312         else:
1313             q = self.make_term_query(toks, 'title')
1314
1315         return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1316
1317     @staticmethod
1318     def chain_filters(filters, op=ChainedFilter.AND):
1319         """
1320         Chains a filter list together
1321         """
1322         filters = filter(lambda x: x is not None, filters)
1323         if not filters:
1324             return None
1325         chf = ChainedFilter(JArray('object')(filters, Filter), op)
1326         return chf
1327
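    # Illustrative sketch (not part of the original class): term_filter() and
    # chain_filters() combine the same way the search_* methods above use them -
    # None entries are dropped and the remaining filters are ANDed together.
    def _example_book_only_filter(self, extra_filter=None):
        return self.chain_filters([extra_filter,
                                   self.term_filter(Term('is_book', 'true'))])
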
1328     def filtered_categories(self, tags):
1329         """
1330         Return a list of tag categories, present in tags list.
1331         """
1332         cats = {}
1333         for t in tags:
1334             cats[t.category] = True
1335         return cats.keys()
1336
1337     def hint(self):
1338         return Hint(self)