1cac82ef6f05f9afc575da00ed16e992458b6da7
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
23
class SolrIndex(object):
    """
    Base class for objects talking to the Solr server.

    Holds a configured CustomSolrInterface connection in self.index.
    """
    def __init__(self, mode=None):
        # mode is forwarded to the interface; subclasses pass 'r' or 'rw'
        # (see Index and Search below).
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
27
28
class Snippets(object):
    """
    Manages the snippet file for an indexed object (book).

    Snippet texts are appended one after another into a single flat file;
    each snippet's (position, length) pair is stored in the search index,
    so it can be read back later with get().
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Filesystem path of this book's snippet file (revision-suffixed if set)."""
        if self.revision:
            name = "%d.%d" % (self.book_id, self.revision)
        else:
            name = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, name)

    def open(self, mode='r'):
        """
        Open the snippet file; call .close() afterwards.

        When opening for writing and the base file already exists, pick the
        first unused revision number instead of clobbering it, so readers
        holding an older index revision can still fetch their snippets.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return its (position, length) tuple, in bytes of UTF-8.
        """
        encoded = snippet.encode('utf-8')
        stored_at = (self.position, len(encoded))
        self.file.write(encoded)
        self.position += len(encoded)
        return stored_at

    def get(self, pos):
        """Return the unicode snippet stored at the (position, length) pair *pos*."""
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close the snippet file, if one is open."""
        if self.file:
            self.file.close()

    def remove(self):
        """Delete the base snippet file and every revisioned copy of it."""
        self.revision = None
        try:
            # base file first (revision None -> unsuffixed path) ...
            os.unlink(self.path)
            # ... then revisions 1, 2, ... until one is missing.
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass
108
109
class Index(SolrIndex):
    """
    Indexes books in Solr: book metadata, content sections, fragments
    (with themes) and the global tag list.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Returns True if anything was deleted.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            # page through all matches, collecting their uids
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category.

        Keyword args:
            remove_only -- if True, only delete, don't re-add.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                # PDCounter pseudo-tags get their own categories so their ids
                # don't collide with regular catalogue tags.
                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id (and parent id, if any).
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book_or_id - Book instance or a book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        # fields repeated on every content part document
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    # master (top-level content) tags of WL documents
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # tags whose subtree should not be indexed as content
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # footnote container tags
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # header-level tags skipped entirely while walking the master element
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # trailing number (optionally followed by "]", "." or spaces) of source_name
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.

        dc_only -- if given, only Dublin Core fields with these names are
        extracted (slug/tags/is_book/published_date are always set).
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date from the trailing number of source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree, or None if absent.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # SAX-like event generator yielding (start, text, end) triples.
            # NOTE: ignore_tags is only honoured for the node it is passed
            # with; the recursive calls don't forward it (original behavior).
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # Accepts either a unicode string or a list of optional strings.
            if isinstance(text, list):
                # need to join it first
                # (fixed: the original filtered the enclosing `content` list
                # here instead of the `text` argument)
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
            # strip verse-end markers
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one part document: book fields + header info + text,
            # and record where its snippet was stored.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        # open fragments, keyed by fragment id
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    # default text handler: goes to the section and to every
                    # currently open fragment
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                # stack of text handlers; the topmost one receives text events
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    # (the original also tested `footnote is not []`, which is
                    # always true — identity against a fresh list; dropped)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # theme names are not part of the content text
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content with the topmost handler.
                    # (the original tested `handle_text is not []`, which is
                    # always true; truthiness is what was meant)
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
514
515
class SearchResult(object):
    """
    A single search result: one book, with a list of content hits
    (sections and fragments) accumulated from one or more queries.
    """
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # TypeError covers a missing field (int(None)); the original
            # caught only ValueError and crashed on docs without a date.
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        # raw Solr score scaled by a local boost factor
        return self._score * self.boost

    def merge(self, other):
        """Merge another result for the same book into this one; keeps the best score."""
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        # lazily fetched and cached Book instance
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into a hit tuple and into its position triple
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """Processed hits: deduplicated section/fragment dicts, best score first."""
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            # keep, per key, the element winning the pairwise compare
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing if its score is no worse
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results from several lists into one list, one entry per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the idx-th hit's snippet."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision for the idx-th hit, or None."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
713
714
715 class Search(SolrIndex):
716     """
717     Search facilities.
718     """
    def __init__(self, default_field="text"):
        """
        Open a read-only connection to the Solr index.

        NOTE(review): default_field is accepted but never used here —
        presumably a leftover; confirm against the rest of the file.
        """
        super(Search, self).__init__(mode='r')
721
722     def make_term_query(self, query, field='text', modal=operator.or_):
723         """
724         Returns term queries joined by boolean query.
725         modal - applies to boolean query
726         fuzzy - should the query by fuzzy.
727         """
728         if query is None:
729             query = ''
730         q = self.index.Q()
731         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
732
733         return q
734
735     def search_phrase(self, searched, field='text', book=False,
736                       filters=None,
737                       snippets=False):
738         if filters is None:
739             filters = []
740         if book:
741             filters.append(self.index.Q(is_book=True))
742
743         q = self.index.query(**{field: searched})
744         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
745         res = q.paginate(rows=100).execute()
746         return [SearchResult(found, how_found=u'search_phrase') for found in res]
747
748     def search_some(self, searched, fields, book=True,
749                     filters=None, snippets=True, query_terms=None):
750         assert isinstance(fields, list)
751         if filters is None:
752             filters = []
753         if book:
754             filters.append(self.index.Q(is_book=True))
755
756         query = self.index.Q()
757
758         for fld in fields:
759             query = self.index.Q(query | self.make_term_query(searched, fld))
760
761         query = self.index.query(query)
762         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
763         res = query.execute()
764         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
765
766     def search_everywhere(self, searched, query_terms=None):
767         """
768         Tries to use search terms to match different fields of book (or its parts).
769         E.g. one word can be an author survey, another be a part of the title, and the rest
770         are some words from third chapter.
771         """
772         books = []
773         # content only query : themes x content
774         q = self.make_term_query(searched, 'text')
775         q_themes = self.make_term_query(searched, 'themes_pl')
776
777         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
778         res = query.execute()
779
780         for found in res:
781             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
782
783         # query themes/content x author/title/tags
784         in_content = self.index.Q()
785         in_meta = self.index.Q()
786
787         for fld in ['themes_pl', 'text']:
788             in_content |= self.make_term_query(searched, field=fld)
789
790         for fld in ['tags', 'authors', 'title']:
791             in_meta |= self.make_term_query(searched, field=fld)
792
793         q = in_content & in_meta
794         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
795
796         for found in res:
797             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
798
799         return books
800
801     def get_snippets(self, searchresult, query, field='text', num=1):
802         """
803         Returns a snippet for found scoreDoc.
804         """
805         maxnum = len(searchresult)
806         if num is None or num < 0 or num > maxnum:
807             num = maxnum
808         book_id = searchresult.book_id
809         revision = searchresult.snippet_revision()
810         snippets = Snippets(book_id, revision=revision)
811         snips = [None] * maxnum
812         try:
813             snippets.open()
814             idx = 0
815             while idx < maxnum and num > 0:
816                 position, length = searchresult.snippet_pos(idx)
817                 if position is None or length is None:
818                     continue
819                 text = snippets.get((int(position),
820                                      int(length)))
821                 snip = self.index.highlight(text=text, field=field, q=query)
822                 snips[idx] = snip
823                 if snip:
824                     num -= 1
825                 idx += 1
826
827         except IOError, e:
828             book = catalogue.models.Book.objects.filter(id=book_id)
829             if not book:
830                 log.error("Book does not exist for book id = %d" % book_id)
831             elif not book.get().children.exists():
832                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
833             return []
834         finally:
835             snippets.close()
836
837             # remove verse end markers..
838         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
839
840         searchresult.snippets = snips
841
842         return snips
843
844     def hint_tags(self, query, pdcounter=True, prefix=True):
845         """
846         Return auto-complete hints for tags
847         using prefix search.
848         """
849         q = self.index.Q()
850         query = query.strip()
851         for field in ['tag_name', 'tag_name_pl']:
852             if prefix:
853                 q |= self.index.Q(**{field: query + "*"})
854             else:
855                 q |= self.make_term_query(query, field=field)
856         qu = self.index.query(q)
857
858         return self.search_tags(qu, pdcounter=pdcounter)
859
860     def search_tags(self, query, filters=None, pdcounter=False):
861         """
862         Search for Tag objects using query.
863         """
864         if not filters:
865             filters = []
866         if not pdcounter:
867             filters.append(~self.index.Q(is_pdcounter=True))
868         res = self.apply_filters(query, filters).execute()
869
870         tags = []
871         pd_tags = []
872
873         for doc in res:
874             is_pdcounter = doc.get('is_pdcounter', False)
875             category = doc.get('tag_category')
876             try:
877                 if is_pdcounter:
878                     if category == 'pd_author':
879                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
880                     elif category == 'pd_book':
881                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
882                         tag.category = 'pd_book'  # make it look more lik a tag.
883                     else:
884                         # WTF
885                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
886                             int(doc.get('tag_id')), category)).encode('utf-8')
887                     pd_tags.append(tag)
888                 else:
889                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
890                     tags.append(tag)
891
892             except catalogue.models.Tag.DoesNotExist:
893                 pass
894             except PDCounterAuthor.DoesNotExist:
895                 pass
896             except PDCounterBook.DoesNotExist:
897                 pass
898
899         tags_slugs = set(map(lambda t: t.slug, tags))
900         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
901
902         log.debug('search_tags: %s' % tags)
903
904         return tags
905
906     def hint_books(self, query, prefix=True):
907         """
908         Returns auto-complete hints for book titles
909         Because we do not index 'pseudo' title-tags.
910         Prefix search.
911         """
912         q = self.index.Q()
913         query = query.strip()
914         if prefix:
915             q |= self.index.Q(title=query + "*")
916             q |= self.index.Q(title_orig=query + "*")
917         else:
918             q |= self.make_term_query(query, field='title')
919             q |= self.make_term_query(query, field='title_orig')
920         qu = self.index.query(q)
921         only_books = self.index.Q(is_book=True)
922         return self.search_books(qu, [only_books])
923
924     def search_books(self, query, filters=None, max_results=10):
925         """
926         Searches for Book objects using query
927         """
928         bks = []
929         bks_found = set()
930         query = query.query(is_book=True)
931         res = self.apply_filters(query, filters).field_limit(['book_id'])
932         for r in res:
933             try:
934                 bid = r['book_id']
935                 if bid not in bks_found:
936                     bks.append(catalogue.models.Book.objects.get(id=bid))
937                     bks_found.add(bid)
938             except catalogue.models.Book.DoesNotExist:
939                 pass
940         return bks
941
942     @staticmethod
943     def apply_filters(query, filters):
944         """
945         Apply filters to a query
946         """
947         if filters is None:
948             filters = []
949         filters = filter(lambda x: x is not None, filters)
950         for f in filters:
951             query = query.query(f)
952         return query
953
954
# Allow dev/test setups to swap in a mock Search implementation (same public
# interface) by setting SEARCH_MOCK in the Django settings; the import below
# shadows the real, Solr-backed Search defined above.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search