7dfe6ef5a131c3f22bc48c7b7965e77506cc9d68
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
23
class SolrIndex(object):
    # Thin base class that wires up the Solr connection.
    # `mode` is passed through to the custom interface (callers in this
    # file use 'r' for searching and 'rw' for indexing).
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
27
28
class Snippets(object):
    """
    Manage the snippet file for an indexed object (book).

    Snippets are concatenated into one flat file per book (and revision);
    each snippet's (position, length) pair is kept in the search index, so
    the text can be read back later with `get()`.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists before any file access.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Filesystem path of the snippet file for the current revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary). Call .close() afterwards.

        When opening for writing, pick the first unused revision number
        instead of overwriting an existing file — other threads may still
        be serving snippets from the old index revision.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, measured in bytes after UTF-8
        encoding.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored at that location.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close the snippet file (safe to call when never opened)."""
        if self.file:
            self.file.close()
            # Drop the stale handle so a repeated close() is a no-op.
            self.file = None

    def remove(self):
        """Remove the base snippet file and every revision file."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass
        finally:
            # Bugfix: don't leave `revision` pointing at the first missing
            # revision number — the object would then address a file that
            # was never created.
            self.revision = None
108
109
class Index(SolrIndex):
    """
    Class indexing books into Solr.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Returns True if anything was deleted, False otherwise.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            # Page through the results, collecting uids; Solr won't return
            # everything in a single response.
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category.

        Keyword args: remove_only=True deletes without re-adding.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                # PDCounter authors/books share the tag id namespace, so the
                # category must be matched explicitly to hit the right doc.
                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a document dict referring the book id (and parent id, if any).
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book_or_id - Book instance or a numeric book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        # Per-part metadata repeated on every content document.
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    # Top-level WL-XML tags that contain the actual text of a work.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # Tags whose content is never indexed (editorial notes, separators...).
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # Footnote container tags.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags that carry no indexable section content.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year in `source_name`, e.g. "... 1884]." -> "1884".
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and return a map of fields keyed by fieldname.

        dc_only -- optional list limiting which Dublin Core fields are
        processed.
        """
        fields = {}

        if book_info is None:
            # Bugfix: close the XML file after parsing instead of leaking
            # the file descriptor.
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date from the trailing year of source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree, or None.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # Depth-first walk yielding (start, text, end) triples:
            # (node, None, None) on entering a node, (None, text, None) for
            # each text chunk, (None, None, node) on leaving it. Subtrees of
            # ignored tags are skipped, but their tails are still emitted.
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            if isinstance(text, list):
                # Bugfix: filter the argument itself; the old code filtered
                # the enclosing `content` list, which corrupted fragment
                # texts passed in as frag['text'].
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
            # strip verse-end markers
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build a Solr document for one part (section, footnote or
            # fragment) and store its text in the snippet file.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    # Default text sink: feed open fragments and the section.
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        # (the old `footnote is not []` identity check was
                        # always true, so it has been dropped — behavior is
                        # unchanged)
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        # NOTE(review): assumes <motyw> only appears inside a
                        # matching <begin> — a stray one raises KeyError.
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content through the current text sink.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
515
516
class SearchResult(object):
    """
    One book's worth of search hits: aggregates matching sections and
    fragments, with scores and snippet locations.
    """
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # Bugfix: a document without published_date yields None, and
            # int(None) raises TypeError, not ValueError — which used to
            # crash the constructor.
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            # hit tuple layout: (position, fragment, score, other) — see the
            # POSITION/FRAGMENT/SCORE/OTHER constants below.
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        # Raw Solr score scaled by a local boost factor.
        return self._score * self.boost

    def merge(self, other):
        """Merge hits from another result for the same book; returns self."""
        if self.book_id != other.book_id:
            # Bugfix: message typo ("is or book" -> "is for book").
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        # Lazily fetch and cache the Book object.
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into the hit tuples stored in self._hits.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """
        Processed hits: deduplicated fragments plus sections not already
        covered by a fragment, sorted by score (descending). Cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            # For each key keep the element winning under `compare`.
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections, keeping the best score
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing if it scored better
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # Bugfix: str.split(r' +') split on the *literal* two
                    # characters " +" (it is not a regex), so multi-word
                    # theme names were never split into words.
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge SearchResults from several lists, one result per book id."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
715
716
717 class Search(SolrIndex):
718     """
719     Search facilities.
720     """
    def __init__(self, default_field="text"):
        # NOTE(review): `default_field` is accepted but never stored or used
        # anywhere in the visible code — possibly vestigial; confirm with
        # callers before removing it.
        super(Search, self).__init__(mode='r')
723
724     def make_term_query(self, query, field='text', modal=operator.or_):
725         """
726         Returns term queries joined by boolean query.
727         modal - applies to boolean query
728         fuzzy - should the query by fuzzy.
729         """
730         if query is None:
731             query = ''
732         q = self.index.Q()
733         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
734
735         return q
736
737     def search_phrase(self, searched, field='text', book=False,
738                       filters=None,
739                       snippets=False):
740         if filters is None:
741             filters = []
742         if book:
743             filters.append(self.index.Q(is_book=True))
744
745         q = self.index.query(**{field: searched})
746         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
747         res = q.execute()
748         return [SearchResult(found, how_found=u'search_phrase') for found in res]
749
750     def search_some(self, searched, fields, book=True,
751                     filters=None, snippets=True, query_terms=None):
752         assert isinstance(fields, list)
753         if filters is None:
754             filters = []
755         if book:
756             filters.append(self.index.Q(is_book=True))
757
758         query = self.index.Q()
759
760         for fld in fields:
761             query = self.index.Q(query | self.make_term_query(searched, fld))
762
763         query = self.index.query(query)
764         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
765         res = query.execute()
766         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
767
768     def search_everywhere(self, searched, query_terms=None):
769         """
770         Tries to use search terms to match different fields of book (or its parts).
771         E.g. one word can be an author survey, another be a part of the title, and the rest
772         are some words from third chapter.
773         """
774         books = []
775         # content only query : themes x content
776         q = self.make_term_query(searched, 'text')
777         q_themes = self.make_term_query(searched, 'themes_pl')
778
779         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
780         res = query.execute()
781
782         for found in res:
783             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
784
785         # query themes/content x author/title/tags
786         in_content = self.index.Q()
787         in_meta = self.index.Q()
788
789         for fld in ['themes_pl', 'text']:
790             in_content |= self.make_term_query(searched, field=fld)
791
792         for fld in ['tags', 'authors', 'title']:
793             in_meta |= self.make_term_query(searched, field=fld)
794
795         q = in_content & in_meta
796         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
797
798         for found in res:
799             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
800
801         return books
802
803     def get_snippets(self, searchresult, query, field='text', num=1):
804         """
805         Returns a snippet for found scoreDoc.
806         """
807         maxnum = len(searchresult)
808         if num is None or num < 0 or num > maxnum:
809             num = maxnum
810         book_id = searchresult.book_id
811         revision = searchresult.snippet_revision()
812         snippets = Snippets(book_id, revision=revision)
813         snips = [None] * maxnum
814         try:
815             snippets.open()
816             idx = 0
817             while idx < maxnum and num > 0:
818                 position, length = searchresult.snippet_pos(idx)
819                 if position is None or length is None:
820                     continue
821                 text = snippets.get((int(position),
822                                      int(length)))
823                 snip = self.index.highlight(text=text, field=field, q=query)
824                 snips[idx] = snip
825                 if snip:
826                     num -= 1
827                 idx += 1
828
829         except IOError, e:
830             book = catalogue.models.Book.objects.filter(id=book_id)
831             if not book:
832                 log.error("Book does not exist for book id = %d" % book_id)
833             elif not book.get().children.exists():
834                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
835             return []
836         finally:
837             snippets.close()
838
839             # remove verse end markers..
840         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
841
842         searchresult.snippets = snips
843
844         return snips
845
846     def hint_tags(self, query, pdcounter=True, prefix=True):
847         """
848         Return auto-complete hints for tags
849         using prefix search.
850         """
851         q = self.index.Q()
852         query = query.strip()
853         for field in ['tag_name', 'tag_name_pl']:
854             if prefix:
855                 q |= self.index.Q(**{field: query + "*"})
856             else:
857                 q |= self.make_term_query(query, field=field)
858         qu = self.index.query(q)
859
860         return self.search_tags(qu, pdcounter=pdcounter)
861
862     def search_tags(self, query, filters=None, pdcounter=False):
863         """
864         Search for Tag objects using query.
865         """
866         if not filters:
867             filters = []
868         if not pdcounter:
869             filters.append(~self.index.Q(is_pdcounter=True))
870         res = self.apply_filters(query, filters).execute()
871
872         tags = []
873         pd_tags = []
874
875         for doc in res:
876             is_pdcounter = doc.get('is_pdcounter', False)
877             category = doc.get('tag_category')
878             try:
879                 if is_pdcounter:
880                     if category == 'pd_author':
881                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
882                     elif category == 'pd_book':
883                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
884                         tag.category = 'pd_book'  # make it look more lik a tag.
885                     else:
886                         # WTF
887                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
888                             int(doc.get('tag_id')), category)).encode('utf-8')
889                     pd_tags.append(tag)
890                 else:
891                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
892                     tags.append(tag)
893
894             except catalogue.models.Tag.DoesNotExist:
895                 pass
896             except PDCounterAuthor.DoesNotExist:
897                 pass
898             except PDCounterBook.DoesNotExist:
899                 pass
900
901         tags_slugs = set(map(lambda t: t.slug, tags))
902         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
903
904         log.debug('search_tags: %s' % tags)
905
906         return tags
907
908     def hint_books(self, query, prefix=True):
909         """
910         Returns auto-complete hints for book titles
911         Because we do not index 'pseudo' title-tags.
912         Prefix search.
913         """
914         q = self.index.Q()
915         query = query.strip()
916         if prefix:
917             q |= self.index.Q(title=query + "*")
918         else:
919             q |= self.make_term_query(query, field='title')
920         qu = self.index.query(q)
921         only_books = self.index.Q(is_book=True)
922         return self.search_books(qu, [only_books])
923
924     def search_books(self, query, filters=None, max_results=10):
925         """
926         Searches for Book objects using query
927         """
928         bks = []
929         bks_found = set()
930         query = query.query(is_book=True)
931         res = self.apply_filters(query, filters).field_limit(['book_id'])
932         for r in res:
933             try:
934                 bid = r['book_id']
935                 if bid not in bks_found:
936                     bks.append(catalogue.models.Book.objects.get(id=bid))
937                     bks_found.add(bid)
938             except catalogue.models.Book.DoesNotExist:
939                 pass
940         return bks
941
942     @staticmethod
943     def apply_filters(query, filters):
944         """
945         Apply filters to a query
946         """
947         if filters is None:
948             filters = []
949         filters = filter(lambda x: x is not None, filters)
950         for f in filters:
951             query = query.query(f)
952         return query
953
954
# When settings.SEARCH_MOCK is truthy, shadow this module's Search name
# with a mock implementation — presumably replacing a Search class
# defined earlier in the file (not visible here); used for test/dev
# setups without a live Solr backend.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search