e7f28c9da1224b8b00c4f2a5eb4a2b1534c798d7
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
import sunburnt
import highlight

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = highlight.HLSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
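    # Illustrative usage (a sketch; assumes settings.SEARCH_INDEX exists and
    # is writable):
    #
    #   snips = Snippets(book_id=1).open('w')
    #   try:
    #       pos = snips.add(u"Litwo! Ojczyzno moja!")   # -> (0, 21)
    #   finally:
    #       snips.close()
    #   snips.open()
    #   try:
    #       assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
    #   finally:
    #       snips.close()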
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Returns a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored
        there as a unicode string.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass


class Index(SolrIndex):
    """
    Class for indexing books.
    """
    def __init__(self):
        super(Index, self).__init__()

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so reimplement it
        by collecting and deleting a list of uids instead.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False
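
    # Illustrative use (a sketch): drop every document indexed for two books,
    # whatever their type, by collecting their uids first:
    #
    #   index.delete_query(index.index.Q(book_id=1), index.index.Q(book_id=2))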

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them (all, or just the ones passed in)
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        }
                doc['uid'] = "tag%d" % tag.id
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """
        Removes a book from the search index.
        book_or_id - a Book instance or a book id.
        """
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the book's contents.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using an index that hasn't been reopened yet
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc

        self.index_content(book, book_fields={
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']})

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
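    # For example (illustrative): searching u"Czytelnik, Warszawa 1990" or
    # u"Czytelnik, Warszawa 1990]. " captures '1990'; trailing ']', '.' and
    # spaces after the year are ignored.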

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date from source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return
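
        # Illustrative traversal (a sketch): for an element tree like
        # <a>x<b>y</b>z</a>, walker(a) yields, in order:
        #   (a, None, None), (None, 'x', None), (b, None, None),
        #   (None, 'y', None), (None, None, b), (None, 'z', None),
        #   (None, None, a)
        # i.e. exactly one of (start, text, end) is set in each triple.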

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        if footnote:
                            doc = add_part(snippets, header_index=position,
                                           header_type=header.tag,
                                           text=u''.join(footnote),
                                           is_footnote=True)
                            self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [],
                                          'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, search, doc, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None:
            tokens_cache = {}

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # cache for the hits property

        self.book_id = int(doc["book_id"])

        pd = doc["published_date"]
        try:
            self.published_date = int(pd)
        except ValueError:
            self.published_date = 0

        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1

            fragment = doc.get("fragment_anchor", None)

            if snippets:
                snippets = snippets.replace("/\n", "\n")
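
            # hit layout (a sketch of the tuple built below):
            #   ((header_type, header_index, header_span), fragment_anchor,
            #    score, {'how_found': ..., 'snippets': [...]})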
            hit = (sec + (header_span,), fragment, self._score,
                   {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into fragments and sections
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()
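
        # e.g. (illustrative): remove_duplicates([(1, 'a'), (2, 'a')], lambda e: e[1],
        #                                        lambda a, b: cmp(a[0], b[0]))
        # keeps only (2, 'a') -- for each key, the element winning the comparison.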

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
                                  lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip an existing section unless this hit scores better
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # inverted, because an earlier publication date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c


class Hint(object):
    """
    Given some hint information (things we already know about the search
    target - like author, title (a specific book), epoch, genre or kind),
    we can narrow the search down using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we are searching these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the
        'tags' field), returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
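
    # e.g. (illustrative): with an author tag hinted and no book hint,
    # just_search_in(['authors', 'title', 'text']) -> ['title', 'text']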


class Search(SolrIndex):
    """
    Search facilities.
    """
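    # NOTE: this class still drives the old Lucene API (IndexSearcher,
    # QueryParser, filters...), none of which is imported above, while the
    # indexing classes already talk to Solr through sunburnt. It looks like
    # mid-migration code and will not run against the Solr backend as-is.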
    def __init__(self, default_field="text"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        reader = IndexReader.open(self.store, True)
        self.searcher = IndexSearcher(reader)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
        index_changed.connect(self.reopen)

    def close(self):
        reader = self.searcher.getIndexReader()
        self.searcher.close()
        reader.close()
        super(Search, self).close()
        index_changed.disconnect(self.reopen)

    def reopen(self, **unused):
        reader = self.searcher.getIndexReader()
        rdr = reader.reopen()
        if not rdr.equals(reader):
            log.debug('Reopening index')
            oldsearch = self.searcher
            self.searcher = IndexSearcher(rdr)
            oldsearch.close()
            reader.close()

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax. (for humans)
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='text', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the
        last case they are just returned (so we can reuse tokens, if we don't
        change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    @staticmethod
    def fuzziness(fuzzy):
        """Helper method to sanitize a fuzziness value."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
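
    # e.g. (illustrative): fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5,
    # fuzziness(False) -> None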

    def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
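
    # e.g. (a sketch): make_phrase([u'pan', u'tadeusz'], field='title') builds a
    # slop-2 PhraseQuery; with fuzzy=0.7, each token is instead expanded to its
    # fuzzy term variants in a MultiPhraseQuery.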

    def make_term_query(self, tokens, field='text', modal=None, fuzzy=False):
        """
        Returns term queries joined into a boolean query.
        modal - the occur mode for the boolean clauses
                (defaults to BooleanClause.Occur.SHOULD)
        fuzzy - should the query be fuzzy.
        """
        if modal is None:
            modal = BooleanClause.Occur.SHOULD
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None),
                             searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
            query.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy),
                                    BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author
        or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy)
                for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy),
                                BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a
        slop of 2, the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q),
                                          how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use the search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and
        the rest some words from a chapter.
        """
        if tokens_cache is None:
            tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='text', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='text',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent',
                                      searched=searched))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'text']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False),
                                         BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False),
                                      BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    #
    #     return None

    def get_snippets(self, scoreDoc, query, field='text'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        revision = stored.get('snippets_revision')
        if revision:
            revision = int(revision)

        # locate content.
        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id, revision=revision)

        try:
            snippets.open()
        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []

        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(),
                                                         scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filt, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            try:
                if is_pdcounter == 'true':
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def search_books(self, query, filt=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filt, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            try:
                bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
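
    # e.g. (illustrative): make_prefix_phrase([u'pan', u'tad'], 'title') matches
    # titles containing the phrase u'pan tad...' -- only the last token is
    # prefix-expanded; earlier tokens must match exactly.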

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for tags,
        using prefix search by default.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles, because we do not index
        'pseudo' title-tags. Prefix search by default.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=None):
        """
        Chains a list of filters together (with ChainedFilter.AND by default).
        """
        if op is None:
            op = ChainedFilter.AND
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
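
    # e.g. (a sketch): chain_filters([f1, None, f2]) AND-chains f1 and f2,
    # silently dropping Nones; chain_filters([]) returns None (no filtering).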

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)