f9fb4b2867232d5f9833d062085d185731194d22
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
23
class SolrIndex(object):
    """Common base of Index and Search: holds the Solr interface handle."""

    def __init__(self, mode=None):
        """Create the interface for settings.SOLR, passing `mode` through."""
        interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = interface
27
28
class Snippets(object):
    """
    Manages the snippet file of an indexed object (book).

    Snippets are written one after another into a single file; the
    (position, length) pair returned by add() is what the caller stores
    in the index to read a snippet back later with get().
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """Make sure the snippet directory exists and remember the target file."""
        snippet_root = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)
        makedirs(snippet_root)
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file: "<book_id>" or "<book_id>.<revision>"."""
        if not self.revision:
            file_name = "%d" % self.book_id
        else:
            file_name = "%d.%d" % (self.book_id, self.revision)
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, file_name)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.
        Call .close() afterwards.

        When opening for writing and the file already exists, the first
        unused revision number (starting at 1) is picked instead of
        overwriting the existing file.
        """
        if 'b' not in mode:
            mode = mode + 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file as UTF-8.
        Return a (position, length) tuple, both counted in bytes.
        """
        encoded = snippet.encode('utf-8')
        size = len(encoded)
        self.file.write(encoded)
        start = self.position
        self.position = start + size
        return (start, size)

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored
        there, decoded from UTF-8.
        """
        offset, size = pos[0], pos[1]
        self.file.seek(offset, 0)
        return self.file.read(size).decode('utf-8')

    def close(self):
        """Close the snippet file if one has been opened."""
        if not self.file:
            return
        self.file.close()

    def remove(self):
        """
        Delete the unrevisioned snippet file and then revisions 1, 2, ...
        stopping at the first file that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            next_revision = 1
            while True:
                self.revision = next_revision
                os.unlink(self.path)
                next_revision += 1
        except OSError:
            pass
108
109
110 class Index(SolrIndex):
111     """
112     Class indexing books.
113     """
114     def __init__(self):
115         super(Index, self).__init__(mode='rw')
116
117     def delete_query(self, *queries):
118         """
119         index.delete(queries=...) doesn't work, so let's reimplement it
120         using deletion of list of uids.
121         """
122         uids = set()
123         for q in queries:
124             if isinstance(q, sunburnt.search.LuceneQuery):
125                 q = self.index.query(q)
126             q.field_limiter.update(['uid'])
127             st = 0
128             rows = 100
129             while True:
130                 ids = q.paginate(start=st, rows=rows).execute()
131                 if not len(ids):
132                     break
133                 for res in ids:
134                     uids.add(res['uid'])
135                 st += rows
136         if uids:
137             self.index.delete(uids)
138             return True
139         else:
140             return False
141
142     def index_tags(self, *tags, **kw):
143         """
144         Re-index global tag list.
145         Removes all tags from index, then index them again.
146         Indexed fields include: id, name (with and without polish stems), category
147         """
148         log.debug("Indexing tags")
149         remove_only = kw.get('remove_only', False)
150         # first, remove tags from index.
151         if tags:
152             tag_qs = []
153             for tag in tags:
154                 q_id = self.index.Q(tag_id=tag.id)
155
156                 if isinstance(tag, PDCounterAuthor):
157                     q_cat = self.index.Q(tag_category='pd_author')
158                 elif isinstance(tag, PDCounterBook):
159                     q_cat = self.index.Q(tag_category='pd_book')
160                 else:
161                     q_cat = self.index.Q(tag_category=tag.category)
162
163                 q_id_cat = self.index.Q(q_id & q_cat)
164                 tag_qs.append(q_id_cat)
165             self.delete_query(*tag_qs)
166         else:  # all
167             q = self.index.Q(tag_id__any=True)
168             self.delete_query(q)
169
170         if not remove_only:
171             # then add them [all or just one passed]
172             if not tags:
173                 tags = chain(
174                     catalogue.models.Tag.objects.exclude(category='set'),
175                     PDCounterAuthor.objects.all(),
176                     PDCounterBook.objects.all())
177
178             for tag in tags:
179                 if isinstance(tag, PDCounterAuthor):
180                     doc = {
181                         "tag_id": int(tag.id),
182                         "tag_name": tag.name,
183                         "tag_name_pl": tag.name,
184                         "tag_category": 'pd_author',
185                         "is_pdcounter": True,
186                         "uid": "tag%d_pd_a" % tag.id
187                         }
188                 elif isinstance(tag, PDCounterBook):
189                     doc = {
190                         "tag_id": int(tag.id),
191                         "tag_name": tag.title,
192                         "tag_name_pl": tag.title,
193                         "tag_category": 'pd_book',
194                         "is_pdcounter": True,
195                         "uid": "tag%d_pd_b" % tag.id
196                         }
197                 else:
198                     doc = {
199                         "tag_id": int(tag.id),
200                         "tag_name": tag.name,
201                         "tag_name_pl": tag.name,
202                         "tag_category": tag.category,
203                         "is_pdcounter": False,
204                         "uid": "tag%d" % tag.id
205                         }
206                 self.index.add(doc)
207
208     def create_book_doc(self, book):
209         """
210         Create a lucene document referring book id.
211         """
212         doc = {'book_id': int(book.id)}
213         if book.parent is not None:
214             doc['parent_id'] = int(book.parent.id)
215         return doc
216
217     def remove_book(self, book_or_id, remove_snippets=True):
218         """Removes a book from search index.
219         book - Book instance."""
220         if isinstance(book_or_id, catalogue.models.Book):
221             book_id = book_or_id.id
222         else:
223             book_id = book_or_id
224
225         self.delete_query(self.index.Q(book_id=book_id))
226
227         if remove_snippets:
228             snippets = Snippets(book_id)
229             snippets.remove()
230
231     def index_book(self, book, book_info=None, overwrite=True):
232         """
233         Indexes the book.
234         Creates a lucene document for extracted metadata
235         and calls self.index_content() to index the contents of the book.
236         """
237         if overwrite:
238             # we don't remove snippets, since they might be still needed by
239             # threads using not reopened index
240             self.remove_book(book, remove_snippets=False)
241
242         book_doc = self.create_book_doc(book)
243         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
244         # let's not index it - it's only used for extracting publish date
245         if 'source_name' in meta_fields:
246             del meta_fields['source_name']
247
248         for n, f in meta_fields.items():
249             book_doc[n] = f
250
251         book_doc['uid'] = "book%s" % book_doc['book_id']
252         self.index.add(book_doc)
253         del book_doc
254         book_fields = {
255             'title': meta_fields['title'],
256             'authors': meta_fields['authors'],
257             'published_date': meta_fields['published_date']
258             }
259
260         if 'translators' in meta_fields:
261             book_fields['translators'] = meta_fields['translators']
262
263         self.index_content(book, book_fields=book_fields)
264
265     master_tags = [
266         'opowiadanie',
267         'powiesc',
268         'dramat_wierszowany_l',
269         'dramat_wierszowany_lp',
270         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
271         'wywiad',
272         ]
273
274     ignore_content_tags = [
275         'uwaga', 'extra', 'nota_red',
276         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
277         'didaskalia',
278         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
279         ]
280
281     footnote_tags = ['pa', 'pt', 'pr', 'pe']
282
283     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
284                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
285
286     published_date_re = re.compile("([0-9]+)[\]. ]*$")
287
288     def extract_metadata(self, book, book_info=None, dc_only=None):
289         """
290         Extract metadata from book and returns a map of fields keyed by fieldname
291         """
292         fields = {}
293
294         if book_info is None:
295             book_info = dcparser.parse(open(book.xml_file.path))
296
297         fields['slug'] = book.slug
298         fields['tags'] = [t.name for t in book.tags]
299         fields['is_book'] = True
300
301         # validator, name
302         for field in dcparser.BookInfo.FIELDS:
303             if dc_only and field.name not in dc_only:
304                 continue
305             if hasattr(book_info, field.name):
306                 if not getattr(book_info, field.name):
307                     continue
308                 # since no type information is available, we use validator
309                 type_indicator = field.validator
310                 if type_indicator == dcparser.as_unicode:
311                     s = getattr(book_info, field.name)
312                     if field.multiple:
313                         s = ', '.join(s)
314                     fields[field.name] = s
315                 elif type_indicator == dcparser.as_person:
316                     p = getattr(book_info, field.name)
317                     if isinstance(p, dcparser.Person):
318                         persons = unicode(p)
319                     else:
320                         persons = ', '.join(map(unicode, p))
321                     fields[field.name] = persons
322                 elif type_indicator == dcparser.as_date:
323                     dt = getattr(book_info, field.name)
324                     fields[field.name] = dt
325
326         # get published date
327         pd = None
328         if hasattr(book_info, 'source_name') and book_info.source_name:
329             match = self.published_date_re.search(book_info.source_name)
330             if match is not None:
331                 pd = str(match.groups()[0])
332         if not pd:
333             pd = ""
334         fields["published_date"] = pd
335
336         return fields
337
338     # def add_gaps(self, fields, fieldname):
339     #     """
340     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
341     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
342     #     """
343     #     def gap():
344     #         while True:
345     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
346     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
347
348     def get_master(self, root):
349         """
350         Returns the first master tag from an etree.
351         """
352         for master in root.iter():
353             if master.tag in self.master_tags:
354                 return master
355
356     def index_content(self, book, book_fields):
357         """
358         Walks the book XML and extract content from it.
359         Adds parts for each header tag and for each fragment.
360         """
361         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
362         root = wld.edoc.getroot()
363
364         master = self.get_master(root)
365         if master is None:
366             return []
367
368         def walker(node):
369             if node.tag not in self.ignore_content_tags:
370                 yield node, None, None
371                 if node.text is not None:
372                     yield None, node.text, None
373                 for child in list(node):
374                     for b, t, e in walker(child):
375                         yield b, t, e
376                 yield None, None, node
377
378             if node.tail is not None:
379                 yield None, node.tail, None
380             return
381
382         def fix_format(text):
383             # separator = [u" ", u"\t", u".", u";", u","]
384             if isinstance(text, list):
385                 # need to join it first
386                 text = filter(lambda s: s is not None, content)
387                 text = u' '.join(text)
388                 # for i in range(len(text)):
389                 #     if i > 0:
390                 #         if text[i][0] not in separator\
391                 #             and text[i - 1][-1] not in separator:
392                 #          text.insert(i, u" ")
393
394             return re.sub("(?m)/$", "", text)
395
396         def add_part(snippets, **fields):
397             doc = self.create_book_doc(book)
398             for n, v in book_fields.items():
399                 doc[n] = v
400
401             doc['header_index'] = fields["header_index"]
402             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
403             doc['header_type'] = fields['header_type']
404
405             doc['text'] = fields['text']
406
407             # snippets
408             snip_pos = snippets.add(fields["text"])
409
410             doc['snippets_position'] = snip_pos[0]
411             doc['snippets_length'] = snip_pos[1]
412             if snippets.revision:
413                 doc["snippets_revision"] = snippets.revision
414
415             if 'fragment_anchor' in fields:
416                 doc["fragment_anchor"] = fields['fragment_anchor']
417
418             if 'themes' in fields:
419                 doc['themes'] = fields['themes']
420             doc['uid'] = "part%s-%s-%s-%s" % (
421                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
422             return doc
423
424         fragments = {}
425         snippets = Snippets(book.id).open('w')
426         try:
427             for header, position in zip(list(master), range(len(master))):
428
429                 if header.tag in self.skip_header_tags:
430                     continue
431                 if header.tag is etree.Comment:
432                     continue
433
434                 # section content
435                 content = []
436                 footnote = []
437
438                 def all_content(text):
439                     for frag in fragments.values():
440                         frag['text'].append(text)
441                     content.append(text)
442                 handle_text = [all_content]
443
444                 for start, text, end in walker(header):
445                     # handle footnotes
446                     if start is not None and start.tag in self.footnote_tags:
447                         footnote = []
448
449                         def collect_footnote(t):
450                             footnote.append(t)
451
452                         handle_text.append(collect_footnote)
453                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
454                         handle_text.pop()
455                         doc = add_part(snippets, header_index=position, header_type=header.tag,
456                                        text=u''.join(footnote),
457                                        is_footnote=True)
458                         self.index.add(doc)
459                         footnote = []
460
461                     # handle fragments and themes.
462                     if start is not None and start.tag == 'begin':
463                         fid = start.attrib['id'][1:]
464                         fragments[fid] = {
465                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
466
467                     # themes for this fragment
468                     elif start is not None and start.tag == 'motyw':
469                         fid = start.attrib['id'][1:]
470                         handle_text.append(lambda text: None)
471                         if start.text is not None:
472                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
473                     elif end is not None and end.tag == 'motyw':
474                         handle_text.pop()
475
476                     elif start is not None and start.tag == 'end':
477                         fid = start.attrib['id'][1:]
478                         if fid not in fragments:
479                             continue  # a broken <end> node, skip it
480                         frag = fragments[fid]
481                         if not frag['themes']:
482                             continue  # empty themes list.
483                         del fragments[fid]
484
485                         doc = add_part(snippets,
486                                        header_type=frag['start_header'],
487                                        header_index=frag['start_section'],
488                                        header_span=position - frag['start_section'] + 1,
489                                        fragment_anchor=fid,
490                                        text=fix_format(frag['text']),
491                                        themes=frag['themes'])
492                         self.index.add(doc)
493
494                         # Collect content.
495
496                     if text is not None and handle_text is not []:
497                         hdl = handle_text[-1]
498                         hdl(text)
499
500                         # in the end, add a section text.
501                 doc = add_part(snippets, header_index=position,
502                                header_type=header.tag, text=fix_format(content))
503
504                 self.index.add(doc)
505
506         finally:
507             snippets.close()
508
509
class SearchResult(object):
    """
    Search hits of a single book.

    Built from one index document; further results for the same book are
    folded in with merge() / aggregate(). The `hits` property turns the raw
    hits into a list of dicts sorted by descending score.
    """

    # Layout of a raw hit tuple kept in self._hits:
    #   (position, fragment_anchor, score, other)
    # where position is (header_type, header_index, header_span).
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - index document (mapping); must contain book_id.
        how_found - label stored with the hit.
        query_terms - terms used by `hits` to work out which themes matched.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # cache filled by the `hits` property
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        self._score = doc.get('score', 0)
        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # field missing (None) or not a number (e.g. an empty string)
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            span = doc['header_span']
            # a missing or zero span counts as one header
            span = (int(span) if span is not None else 0) or 1
            position = (header_type, int(doc["header_index"]), span)
            other = {
                'how_found': how_found,
                'snippets_pos': (doc['snippets_position'], doc['snippets_length']),
                'snippets_revision': doc.get('snippets_revision', None),
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
                }
            self._hits.append(
                (position, doc.get("fragment_anchor", None), self._score, other))

    def __unicode__(self):
        processed = len(self._processed_hits) if self._processed_hits else -1
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % (
            self.book_id, len(self._hits), processed,
            self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Take over the hits of another result for the same book and keep the
        better score. Returns self. Raises ValueError for a different book.
        """
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d"
                % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        """Return the Book for book_id, fetching it on first use."""
        if self._book is None:
            self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    @property
    def hits(self):
        """
        Processed hits: a list of dicts sorted by descending score.

        Section hits lying inside the span of a fragment hit are dropped,
        duplicate sections and fragments are reduced to one hit each, and
        fragment hits get their Fragment object, its theme tags and the
        themes matched by query_terms. The list is computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION, INDEX, SPAN = self.POSITION, self.POSITION_INDEX, self.POSITION_SPAN
        FRAGMENT, SCORE, OTHER = self.FRAGMENT, self.SCORE, self.OTHER

        # to sections and fragments
        frags = [hit for hit in self._hits if hit[FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[FRAGMENT] is None]

        def covered_by_fragment(section):
            index = section[POSITION][INDEX]
            return any(
                f[POSITION][INDEX] <= index < f[POSITION][INDEX] + f[POSITION][SPAN]
                for f in frags)

        # sections not covered by fragments
        sect = [s for s in sect if not covered_by_fragment(s)]

        # one hit per fragment anchor; a later hit replaces an earlier one
        # unless the earlier one scores strictly higher
        best_frags = {}
        for f in frags:
            known = best_frags.get(f[FRAGMENT])
            if known is not None and known[SCORE] > f[SCORE]:
                continue
            best_frags[f[FRAGMENT]] = f
        frags = list(best_frags.values())

        # one hit per section; the first of equally scored hits is kept
        sections = {}
        for s in sect:
            si = s[POSITION][INDEX]
            if si in sections and sections[si]['score'] >= s[SCORE]:
                continue
            m = {'score': s[SCORE],
                 'section_number': si + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            other = f[OTHER]
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            matched_names = set()
            if self.query_terms is not None:
                themes_pl = other['themes_pl']
                for i, theme in enumerate(other['themes']):
                    # themes_pl may be absent or shorter than themes
                    stemmed = themes_pl[i] if i < len(themes_pl) else u''
                    # str.split() takes a literal separator, so runs of
                    # spaces have to be split with a regular expression
                    words = re.split(r' +', theme) + stemmed.split(' ')
                    words = [word.lower() for word in words]
                    if any(qt in words for qt in self.query_terms):
                        matched_names.add(theme)

            # map matched names to theme tags; the first tag of a name wins
            by_name = {}
            for theme_tag in themes:
                by_name.setdefault(theme_tag.name, theme_tag)
            themes_hit = [by_name[name] for name in matched_names if name in by_name]

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(other)
            hits.append(m)

        hits.sort(key=lambda hit: hit['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results from several lists so that each book is represented
        by one SearchResult; return them as a list.
        """
        books = {}
        for results in result_lists:
            for result in results:
                known = books.get(result.book_id)
                if known is None:
                    books[result.book_id] = result
                else:
                    known.merge(result)
        return list(books.values())

    def __cmp__(self, other):
        # on equal score the comparison is inverted, because earlier date is better
        return (cmp(self.score, other.score)
                or cmp(other.published_date, self.published_date))

    def __len__(self):
        """Number of processed hits."""
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the snippet of the idx-th processed hit."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision of the idx-th processed hit, or None."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
707
708
709 class Search(SolrIndex):
710     """
711     Search facilities.
712     """
713     def __init__(self, default_field="text"):
714         super(Search, self).__init__(mode='r')
715
716     def make_term_query(self, query, field='text', modal=operator.or_):
717         """
718         Returns term queries joined by boolean query.
719         modal - applies to boolean query
720         fuzzy - should the query by fuzzy.
721         """
722         if query is None:
723             query = ''
724         q = self.index.Q()
725         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
726
727         return q
728
729     def search_phrase(self, searched, field='text', book=False,
730                       filters=None,
731                       snippets=False):
732         if filters is None:
733             filters = []
734         if book:
735             filters.append(self.index.Q(is_book=True))
736
737         q = self.index.query(**{field: searched})
738         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
739         res = q.paginate(rows=100).execute()
740         return [SearchResult(found, how_found=u'search_phrase') for found in res]
741
742     def search_some(self, searched, fields, book=True,
743                     filters=None, snippets=True, query_terms=None):
744         assert isinstance(fields, list)
745         if filters is None:
746             filters = []
747         if book:
748             filters.append(self.index.Q(is_book=True))
749
750         query = self.index.Q()
751
752         for fld in fields:
753             query = self.index.Q(query | self.make_term_query(searched, fld))
754
755         query = self.index.query(query)
756         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
757         res = query.execute()
758         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
759
760     def search_everywhere(self, searched, query_terms=None):
761         """
762         Tries to use search terms to match different fields of book (or its parts).
763         E.g. one word can be an author survey, another be a part of the title, and the rest
764         are some words from third chapter.
765         """
766         books = []
767         # content only query : themes x content
768         q = self.make_term_query(searched, 'text')
769         q_themes = self.make_term_query(searched, 'themes_pl')
770
771         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
772         res = query.execute()
773
774         for found in res:
775             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
776
777         # query themes/content x author/title/tags
778         in_content = self.index.Q()
779         in_meta = self.index.Q()
780
781         for fld in ['themes_pl', 'text']:
782             in_content |= self.make_term_query(searched, field=fld)
783
784         for fld in ['tags', 'authors', 'title']:
785             in_meta |= self.make_term_query(searched, field=fld)
786
787         q = in_content & in_meta
788         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
789
790         for found in res:
791             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
792
793         return books
794
795     def get_snippets(self, searchresult, query, field='text', num=1):
796         """
797         Returns a snippet for found scoreDoc.
798         """
799         maxnum = len(searchresult)
800         if num is None or num < 0 or num > maxnum:
801             num = maxnum
802         book_id = searchresult.book_id
803         revision = searchresult.snippet_revision()
804         snippets = Snippets(book_id, revision=revision)
805         snips = [None] * maxnum
806         try:
807             snippets.open()
808             idx = 0
809             while idx < maxnum and num > 0:
810                 position, length = searchresult.snippet_pos(idx)
811                 if position is None or length is None:
812                     continue
813                 text = snippets.get((int(position),
814                                      int(length)))
815                 snip = self.index.highlight(text=text, field=field, q=query)
816                 if snip not in snips:
817                     snips[idx] = snip
818                     if snip:
819                         num -= 1
820                 idx += 1
821
822         except IOError, e:
823             book = catalogue.models.Book.objects.filter(id=book_id)
824             if not book:
825                 log.error("Book does not exist for book id = %d" % book_id)
826             elif not book.get().children.exists():
827                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
828             return []
829         finally:
830             snippets.close()
831
832             # remove verse end markers..
833         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
834
835         searchresult.snippets = snips
836
837         return snips
838
839     def hint_tags(self, query, pdcounter=True, prefix=True):
840         """
841         Return auto-complete hints for tags
842         using prefix search.
843         """
844         q = self.index.Q()
845         query = query.strip()
846         for field in ['tag_name', 'tag_name_pl']:
847             if prefix:
848                 q |= self.index.Q(**{field: query + "*"})
849             else:
850                 q |= self.make_term_query(query, field=field)
851         qu = self.index.query(q)
852
853         return self.search_tags(qu, pdcounter=pdcounter)
854
855     def search_tags(self, query, filters=None, pdcounter=False):
856         """
857         Search for Tag objects using query.
858         """
859         if not filters:
860             filters = []
861         if not pdcounter:
862             filters.append(~self.index.Q(is_pdcounter=True))
863         res = self.apply_filters(query, filters).execute()
864
865         tags = []
866         pd_tags = []
867
868         for doc in res:
869             is_pdcounter = doc.get('is_pdcounter', False)
870             category = doc.get('tag_category')
871             try:
872                 if is_pdcounter:
873                     if category == 'pd_author':
874                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
875                     elif category == 'pd_book':
876                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
877                         tag.category = 'pd_book'  # make it look more lik a tag.
878                     else:
879                         # WTF
880                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
881                             int(doc.get('tag_id')), category)).encode('utf-8')
882                     pd_tags.append(tag)
883                 else:
884                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
885                     tags.append(tag)
886
887             except catalogue.models.Tag.DoesNotExist:
888                 pass
889             except PDCounterAuthor.DoesNotExist:
890                 pass
891             except PDCounterBook.DoesNotExist:
892                 pass
893
894         tags_slugs = set(map(lambda t: t.slug, tags))
895         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
896
897         log.debug('search_tags: %s' % tags)
898
899         return tags
900
901     def hint_books(self, query, prefix=True):
902         """
903         Returns auto-complete hints for book titles
904         Because we do not index 'pseudo' title-tags.
905         Prefix search.
906         """
907         q = self.index.Q()
908         query = query.strip()
909         if prefix:
910             q |= self.index.Q(title=query + "*")
911             q |= self.index.Q(title_orig=query + "*")
912         else:
913             q |= self.make_term_query(query, field='title')
914             q |= self.make_term_query(query, field='title_orig')
915         qu = self.index.query(q)
916         only_books = self.index.Q(is_book=True)
917         return self.search_books(qu, [only_books])
918
919     def search_books(self, query, filters=None, max_results=10):
920         """
921         Searches for Book objects using query
922         """
923         bks = []
924         bks_found = set()
925         query = query.query(is_book=True)
926         res = self.apply_filters(query, filters).field_limit(['book_id'])
927         for r in res:
928             try:
929                 bid = r['book_id']
930                 if bid not in bks_found:
931                     bks.append(catalogue.models.Book.objects.get(id=bid))
932                     bks_found.add(bid)
933             except catalogue.models.Book.DoesNotExist:
934                 pass
935         return bks
936
937     @staticmethod
938     def apply_filters(query, filters):
939         """
940         Apply filters to a query
941         """
942         if filters is None:
943             filters = []
944         filters = filter(lambda x: x is not None, filters)
945         for f in filters:
946             query = query.query(f)
947         return query
948
949
# When the SEARCH_MOCK setting is present and truthy, import `Search` from the
# sibling mock_search module.  This rebinds the module-level name `Search`
# (presumably replacing the real implementation defined earlier in this file,
# e.g. for tests or environments without a search backend -- TODO confirm).
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search