update librarian
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
23
class SolrIndex(object):
    """Base class that owns the Solr interface used by indexing and searching."""

    def __init__(self, mode=None):
        """
        Create the Solr interface from ``settings.SOLR``.

        mode - passed through unchanged to ``custom.CustomSolrInterface``.
        """
        solr_interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = solr_interface
27
28
class Snippets(object):
    """
    Manages the snippet file of an indexed object (book).

    All snippets of a book are concatenated in a single file; the position
    and length of each snippet are kept in index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """Remember the book id / revision and make sure the snippet directory exists."""
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file: "<book_id>" or "<book_id>.<revision>"."""
        if self.revision:
            file_name = "%d.%d" % (self.book_id, self.revision)
        else:
            file_name = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, file_name)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.
        Call .close() afterwards.

        When opening for writing and the file already exists, the first
        unused revision number is picked so the existing file is left alone.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # self.path depends on self.revision, so bump it until the path is free
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file, encoded as UTF-8.
        Return a (position, length) tuple, both counted in bytes.
        """
        encoded = snippet.encode('utf-8')
        length = len(encoded)
        self.file.write(encoded)
        start = self.position
        self.position = start + length
        return (start, length)

    def get(self, pos):
        """
        Given a tuple of (position, length) return the snippet
        stored there, decoded from UTF-8.
        """
        self.file.seek(pos[0], 0)
        raw = self.file.read(pos[1])
        return raw.decode('utf-8')

    def close(self):
        """Close the snippet file if one is open."""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the base snippet file and then revisions 1, 2, ... until
        the first one that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 1
            while True:
                os.unlink(self.path)
                self.revision += 1
        except OSError:
            # first missing file ends the clean-up
            pass
108
109
110 class Index(SolrIndex):
111     """
112     Class indexing books.
113     """
    def __init__(self):
        """Initialise the base SolrIndex with mode='rw'."""
        super(Index, self).__init__(mode='rw')
116
117     def delete_query(self, *queries):
118         """
119         index.delete(queries=...) doesn't work, so let's reimplement it
120         using deletion of list of uids.
121         """
122         uids = set()
123         for q in queries:
124             if isinstance(q, sunburnt.search.LuceneQuery):
125                 q = self.index.query(q)
126             q.field_limiter.update(['uid'])
127             st = 0
128             rows = 100
129             while True:
130                 ids = q.paginate(start=st, rows=rows).execute()
131                 if not len(ids):
132                     break
133                 for res in ids:
134                     uids.add(res['uid'])
135                 st += rows
136         if uids:
137             self.index.delete(uids)
138             return True
139         else:
140             return False
141
142     def index_tags(self, *tags, **kw):
143         """
144         Re-index global tag list.
145         Removes all tags from index, then index them again.
146         Indexed fields include: id, name (with and without polish stems), category
147         """
148         log.debug("Indexing tags")
149         remove_only = kw.get('remove_only', False)
150         # first, remove tags from index.
151         if tags:
152             tag_qs = []
153             for tag in tags:
154                 q_id = self.index.Q(tag_id=tag.id)
155
156                 if isinstance(tag, PDCounterAuthor):
157                     q_cat = self.index.Q(tag_category='pd_author')
158                 elif isinstance(tag, PDCounterBook):
159                     q_cat = self.index.Q(tag_category='pd_book')
160                 else:
161                     q_cat = self.index.Q(tag_category=tag.category)
162
163                 q_id_cat = self.index.Q(q_id & q_cat)
164                 tag_qs.append(q_id_cat)
165             self.delete_query(*tag_qs)
166         else:  # all
167             q = self.index.Q(tag_id__any=True)
168             self.delete_query(q)
169
170         if not remove_only:
171             # then add them [all or just one passed]
172             if not tags:
173                 tags = chain(
174                     catalogue.models.Tag.objects.exclude(category='set'),
175                     PDCounterAuthor.objects.all(),
176                     PDCounterBook.objects.all())
177
178             for tag in tags:
179                 if isinstance(tag, PDCounterAuthor):
180                     doc = {
181                         "tag_id": int(tag.id),
182                         "tag_name": tag.name,
183                         "tag_name_pl": tag.name,
184                         "tag_category": 'pd_author',
185                         "is_pdcounter": True,
186                         "uid": "tag%d_pd_a" % tag.id
187                         }
188                 elif isinstance(tag, PDCounterBook):
189                     doc = {
190                         "tag_id": int(tag.id),
191                         "tag_name": tag.title,
192                         "tag_name_pl": tag.title,
193                         "tag_category": 'pd_book',
194                         "is_pdcounter": True,
195                         "uid": "tag%d_pd_b" % tag.id
196                         }
197                 else:
198                     doc = {
199                         "tag_id": int(tag.id),
200                         "tag_name": tag.name,
201                         "tag_name_pl": tag.name,
202                         "tag_category": tag.category,
203                         "is_pdcounter": False,
204                         "uid": "tag%d" % tag.id
205                         }
206                 self.index.add(doc)
207
208     def create_book_doc(self, book):
209         """
210         Create a lucene document referring book id.
211         """
212         doc = {'book_id': int(book.id)}
213         if book.parent is not None:
214             doc['parent_id'] = int(book.parent.id)
215         return doc
216
217     def remove_book(self, book_or_id, remove_snippets=True):
218         """Removes a book from search index.
219         book - Book instance."""
220         if isinstance(book_or_id, catalogue.models.Book):
221             book_id = book_or_id.id
222         else:
223             book_id = book_or_id
224
225         self.delete_query(self.index.Q(book_id=book_id))
226
227         if remove_snippets:
228             snippets = Snippets(book_id)
229             snippets.remove()
230
231     def index_book(self, book, book_info=None, overwrite=True):
232         """
233         Indexes the book.
234         Creates a lucene document for extracted metadata
235         and calls self.index_content() to index the contents of the book.
236         """
237         if overwrite:
238             # we don't remove snippets, since they might be still needed by
239             # threads using not reopened index
240             self.remove_book(book, remove_snippets=False)
241
242         book_doc = self.create_book_doc(book)
243         meta_fields = self.extract_metadata(book, book_info, dc_only=[
244             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
245         # let's not index it - it's only used for extracting publish date
246         if 'source_name' in meta_fields:
247             del meta_fields['source_name']
248
249         for n, f in meta_fields.items():
250             book_doc[n] = f
251
252         book_doc['uid'] = "book%s" % book_doc['book_id']
253         self.index.add(book_doc)
254         del book_doc
255         book_fields = {
256             'title': meta_fields['title'],
257             'authors': meta_fields['authors'],
258             'published_date': meta_fields['published_date']
259             }
260
261         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
262             if tag_name in meta_fields:
263                 book_fields[tag_name] = meta_fields[tag_name]
264
265         self.index_content(book, book_fields=book_fields)
266
267     master_tags = [
268         'opowiadanie',
269         'powiesc',
270         'dramat_wierszowany_l',
271         'dramat_wierszowany_lp',
272         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
273         'wywiad',
274     ]
275
276     ignore_content_tags = [
277         'uwaga', 'extra', 'nota_red', 'abstrakt',
278         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
279         'didaskalia',
280         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
281     ]
282
283     footnote_tags = ['pa', 'pt', 'pr', 'pe']
284
285     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
286                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
287
288     published_date_re = re.compile("([0-9]+)[\]. ]*$")
289
290     def extract_metadata(self, book, book_info=None, dc_only=None):
291         """
292         Extract metadata from book and returns a map of fields keyed by fieldname
293         """
294         fields = {}
295
296         if book_info is None:
297             book_info = dcparser.parse(open(book.xml_file.path))
298
299         fields['slug'] = book.slug
300         fields['tags'] = [t.name for t in book.tags]
301         fields['is_book'] = True
302
303         # validator, name
304         for field in dcparser.BookInfo.FIELDS:
305             if dc_only and field.name not in dc_only:
306                 continue
307             if hasattr(book_info, field.name):
308                 if not getattr(book_info, field.name):
309                     continue
310                 # since no type information is available, we use validator
311                 type_indicator = field.validator
312                 if type_indicator == dcparser.as_unicode:
313                     s = getattr(book_info, field.name)
314                     if field.multiple:
315                         s = ', '.join(s)
316                     fields[field.name] = s
317                 elif type_indicator == dcparser.as_person:
318                     p = getattr(book_info, field.name)
319                     if isinstance(p, dcparser.Person):
320                         persons = unicode(p)
321                     else:
322                         persons = ', '.join(map(unicode, p))
323                     fields[field.name] = persons
324                 elif type_indicator == dcparser.as_date:
325                     dt = getattr(book_info, field.name)
326                     fields[field.name] = dt
327
328         # get published date
329         pd = None
330         if hasattr(book_info, 'source_name') and book_info.source_name:
331             match = self.published_date_re.search(book_info.source_name)
332             if match is not None:
333                 pd = str(match.groups()[0])
334         if not pd:
335             pd = ""
336         fields["published_date"] = pd
337
338         return fields
339
340     # def add_gaps(self, fields, fieldname):
341     #     """
342     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
343     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
344     #     """
345     #     def gap():
346     #         while True:
347     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
348     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
349
350     def get_master(self, root):
351         """
352         Returns the first master tag from an etree.
353         """
354         for master in root.iter():
355             if master.tag in self.master_tags:
356                 return master
357
358     def index_content(self, book, book_fields):
359         """
360         Walks the book XML and extract content from it.
361         Adds parts for each header tag and for each fragment.
362         """
363         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
364         root = wld.edoc.getroot()
365
366         master = self.get_master(root)
367         if master is None:
368             return []
369
370         def walker(node):
371             if node.tag not in self.ignore_content_tags:
372                 yield node, None, None
373                 if node.text is not None:
374                     yield None, node.text, None
375                 for child in list(node):
376                     for b, t, e in walker(child):
377                         yield b, t, e
378                 yield None, None, node
379
380             if node.tail is not None:
381                 yield None, node.tail, None
382             return
383
384         def fix_format(text):
385             # separator = [u" ", u"\t", u".", u";", u","]
386             if isinstance(text, list):
387                 # need to join it first
388                 text = filter(lambda s: s is not None, content)
389                 text = u' '.join(text)
390                 # for i in range(len(text)):
391                 #     if i > 0:
392                 #         if text[i][0] not in separator\
393                 #             and text[i - 1][-1] not in separator:
394                 #          text.insert(i, u" ")
395
396             return re.sub("(?m)/$", "", text)
397
398         def add_part(snippets, **fields):
399             doc = self.create_book_doc(book)
400             for n, v in book_fields.items():
401                 doc[n] = v
402
403             doc['header_index'] = fields["header_index"]
404             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
405             doc['header_type'] = fields['header_type']
406
407             doc['text'] = fields['text']
408
409             # snippets
410             snip_pos = snippets.add(fields["text"])
411
412             doc['snippets_position'] = snip_pos[0]
413             doc['snippets_length'] = snip_pos[1]
414             if snippets.revision:
415                 doc["snippets_revision"] = snippets.revision
416
417             if 'fragment_anchor' in fields:
418                 doc["fragment_anchor"] = fields['fragment_anchor']
419
420             if 'themes' in fields:
421                 doc['themes'] = fields['themes']
422             doc['uid'] = "part%s-%s-%s-%s" % (
423                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
424             return doc
425
426         fragments = {}
427         snippets = Snippets(book.id).open('w')
428         try:
429             for header, position in zip(list(master), range(len(master))):
430
431                 if header.tag in self.skip_header_tags:
432                     continue
433                 if header.tag is etree.Comment:
434                     continue
435
436                 # section content
437                 content = []
438                 footnote = []
439
440                 def all_content(text):
441                     for frag in fragments.values():
442                         frag['text'].append(text)
443                     content.append(text)
444                 handle_text = [all_content]
445
446                 for start, text, end in walker(header):
447                     # handle footnotes
448                     if start is not None and start.tag in self.footnote_tags:
449                         footnote = []
450
451                         def collect_footnote(t):
452                             footnote.append(t)
453
454                         handle_text.append(collect_footnote)
455                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
456                         handle_text.pop()
457                         doc = add_part(snippets, header_index=position, header_type=header.tag,
458                                        text=u''.join(footnote),
459                                        is_footnote=True)
460                         self.index.add(doc)
461                         footnote = []
462
463                     # handle fragments and themes.
464                     if start is not None and start.tag == 'begin':
465                         fid = start.attrib['id'][1:]
466                         fragments[fid] = {
467                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
468
469                     # themes for this fragment
470                     elif start is not None and start.tag == 'motyw':
471                         fid = start.attrib['id'][1:]
472                         handle_text.append(lambda text: None)
473                         if start.text is not None:
474                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
475                     elif end is not None and end.tag == 'motyw':
476                         handle_text.pop()
477
478                     elif start is not None and start.tag == 'end':
479                         fid = start.attrib['id'][1:]
480                         if fid not in fragments:
481                             continue  # a broken <end> node, skip it
482                         frag = fragments[fid]
483                         if not frag['themes']:
484                             continue  # empty themes list.
485                         del fragments[fid]
486
487                         doc = add_part(snippets,
488                                        header_type=frag['start_header'],
489                                        header_index=frag['start_section'],
490                                        header_span=position - frag['start_section'] + 1,
491                                        fragment_anchor=fid,
492                                        text=fix_format(frag['text']),
493                                        themes=frag['themes'])
494                         self.index.add(doc)
495
496                         # Collect content.
497
498                     if text is not None and handle_text is not []:
499                         hdl = handle_text[-1]
500                         hdl(text)
501
502                         # in the end, add a section text.
503                 doc = add_part(snippets, header_index=position,
504                                header_type=header.tag, text=fix_format(content))
505
506                 self.index.add(doc)
507
508         finally:
509             snippets.close()
510
511
512 class SearchResult(object):
513     def __init__(self, doc, how_found=None, query_terms=None):
514         self.boost = 1.0
515         self._hits = []
516         self._processed_hits = None  # processed hits
517         self.snippets = []
518         self.query_terms = query_terms
519         self._book = None
520
521         if 'score' in doc:
522             self._score = doc['score']
523         else:
524             self._score = 0
525
526         self.book_id = int(doc["book_id"])
527
528         try:
529             self.published_date = int(doc.get("published_date"))
530         except ValueError:
531             self.published_date = 0
532
533         # content hits
534         header_type = doc.get("header_type", None)
535         # we have a content hit in some header of fragment
536         if header_type is not None:
537             sec = (header_type, int(doc["header_index"]))
538             header_span = doc['header_span']
539             header_span = header_span is not None and int(header_span) or 1
540             fragment = doc.get("fragment_anchor", None)
541             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
542             snippets_rev = doc.get('snippets_revision', None)
543
544             hit = (sec + (header_span,), fragment, self._score, {
545                 'how_found': how_found,
546                 'snippets_pos': snippets_pos,
547                 'snippets_revision': snippets_rev,
548                 'themes': doc.get('themes', []),
549                 'themes_pl': doc.get('themes_pl', [])
550                 })
551
552             self._hits.append(hit)
553
554     def __unicode__(self):
555         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
556             (self.book_id, len(self._hits),
557              len(self._processed_hits) if self._processed_hits else -1,
558              self._score, len(self.snippets))
559
    def __str__(self):
        """Return the unicode description encoded as UTF-8."""
        return unicode(self).encode('utf-8')
562
563     @property
564     def score(self):
565         return self._score * self.boost
566
567     def merge(self, other):
568         if self.book_id != other.book_id:
569             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
570         self._hits += other._hits
571         if other.score > self.score:
572             self._score = other._score
573         return self
574
575     def get_book(self):
576         if self._book is not None:
577             return self._book
578         self._book = catalogue.models.Book.objects.get(id=self.book_id)
579         return self._book
580
581     book = property(get_book)
582
    # Indexes into a hit tuple as built in __init__:
    #   (position, fragment_anchor, score, other)
    # where position is itself a tuple (header_type, header_index, header_span).
    POSITION = 0  # hit[POSITION] -> the position tuple
    FRAGMENT = 1  # hit[FRAGMENT] -> fragment anchor, None for plain section hits
    POSITION_INDEX = 1  # position[POSITION_INDEX] -> header_index
    POSITION_SPAN = 2  # position[POSITION_SPAN] -> header_span
    SCORE = 2  # hit[SCORE] -> score of the hit
    OTHER = 3  # hit[OTHER] -> dict with how_found, snippets and themes data
589
590     @property
591     def hits(self):
592         if self._processed_hits is not None:
593             return self._processed_hits
594
595         # to sections and fragments
596         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
597
598         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
599
600         # sections not covered by fragments
601         sect = filter(lambda s: 0 == len(filter(
602             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
603                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
604
605         def remove_duplicates(lst, keyfn, compare):
606             els = {}
607             for e in lst:
608                 eif = keyfn(e)
609                 if eif in els:
610                     if compare(els[eif], e) >= 1:
611                         continue
612                 els[eif] = e
613             return els.values()
614
615         # remove fragments with duplicated fid's and duplicated snippets
616         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
617         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
618         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
619
620         # remove duplicate sections
621         sections = {}
622
623         for s in sect:
624             si = s[self.POSITION][self.POSITION_INDEX]
625             # skip existing
626             if si in sections:
627                 if sections[si]['score'] >= s[self.SCORE]:
628                     continue
629
630             m = {'score': s[self.SCORE],
631                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
632                  }
633             m.update(s[self.OTHER])
634             sections[si] = m
635
636         hits = sections.values()
637
638         for f in frags:
639             try:
640                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
641             except catalogue.models.Fragment.DoesNotExist:
642                 # stale index
643                 continue
644             # Figure out if we were searching for a token matching some word in theme name.
645             themes = frag.tags.filter(category='theme')
646             themes_hit = set()
647             if self.query_terms is not None:
648                 for i in range(0, len(f[self.OTHER]['themes'])):
649                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
650                     tms = map(unicode.lower, tms)
651                     for qt in self.query_terms:
652                         if qt in tms:
653                             themes_hit.add(f[self.OTHER]['themes'][i])
654                             break
655
656             def theme_by_name(n):
657                 th = filter(lambda t: t.name == n, themes)
658                 if th:
659                     return th[0]
660                 else:
661                     return None
662             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
663
664             m = {'score': f[self.SCORE],
665                  'fragment': frag,
666                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
667                  'themes': themes,
668                  'themes_hit': themes_hit
669                  }
670             m.update(f[self.OTHER])
671             hits.append(m)
672
673         hits.sort(key=lambda h: h['score'], reverse=True)
674
675         self._processed_hits = hits
676
677         return hits
678
679     @staticmethod
680     def aggregate(*result_lists):
681         books = {}
682         for rl in result_lists:
683             for r in rl:
684                 if r.book_id in books:
685                     books[r.book_id].merge(r)
686                 else:
687                     books[r.book_id] = r
688         return books.values()
689
690     def __cmp__(self, other):
691         c = cmp(self.score, other.score)
692         if c == 0:
693             # this is inverted, because earlier date is better
694             return cmp(other.published_date, self.published_date)
695         else:
696             return c
697
698     def __len__(self):
699         return len(self.hits)
700
701     def snippet_pos(self, idx=0):
702         return self.hits[idx]['snippets_pos']
703
704     def snippet_revision(self, idx=0):
705         try:
706             return self.hits[idx]['snippets_revision']
707         except (IndexError, KeyError):
708             return None
709
710
711 class Search(SolrIndex):
712     """
713     Search facilities.
714     """
    def __init__(self, default_field="text"):
        """
        Initialise the base SolrIndex with mode='r'.

        default_field is accepted but not used by this constructor.
        """
        super(Search, self).__init__(mode='r')
717
718     def make_term_query(self, query, field='text', modal=operator.or_):
719         """
720         Returns term queries joined by boolean query.
721         modal - applies to boolean query
722         fuzzy - should the query by fuzzy.
723         """
724         if query is None:
725             query = ''
726         q = self.index.Q()
727         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
728
729         return q
730
731     def search_words(self, words, fields, book=True):
732         filters = []
733         for word in words:
734             word_filter = None
735             for field in fields:
736                 q = self.index.Q(**{field: word})
737                 if word_filter is None:
738                     word_filter = q
739                 else:
740                     word_filter |= q
741             filters.append(word_filter)
742         if book:
743             query = self.index.query(is_book=True)
744         else:
745             query = self.index.query()
746         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
747         return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]
748
749     def get_snippets(self, searchresult, query, field='text', num=1):
750         """
751         Returns a snippet for found scoreDoc.
752         """
753         maxnum = len(searchresult)
754         if num is None or num < 0 or num > maxnum:
755             num = maxnum
756         book_id = searchresult.book_id
757         revision = searchresult.snippet_revision()
758         snippets = Snippets(book_id, revision=revision)
759         snips = [None] * maxnum
760         try:
761             snippets.open()
762             idx = 0
763             while idx < maxnum and num > 0:
764                 position, length = searchresult.snippet_pos(idx)
765                 if position is None or length is None:
766                     continue
767                 text = snippets.get((int(position),
768                                      int(length)))
769                 snip = self.index.highlight(text=text, field=field, q=query)
770                 if snip not in snips:
771                     snips[idx] = snip
772                     if snip:
773                         num -= 1
774                 idx += 1
775
776         except IOError, e:
777             book = catalogue.models.Book.objects.filter(id=book_id)
778             if not book:
779                 log.error("Book does not exist for book id = %d" % book_id)
780             elif not book.get().children.exists():
781                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
782             return []
783         finally:
784             snippets.close()
785
786             # remove verse end markers..
787         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
788
789         searchresult.snippets = snips
790
791         return snips
792
793     def hint_tags(self, query, pdcounter=True, prefix=True):
794         """
795         Return auto-complete hints for tags
796         using prefix search.
797         """
798         q = self.index.Q()
799         query = query.strip()
800         for field in ['tag_name', 'tag_name_pl']:
801             if prefix:
802                 q |= self.index.Q(**{field: query + "*"})
803             else:
804                 q |= self.make_term_query(query, field=field)
805         qu = self.index.query(q)
806
807         return self.search_tags(qu, pdcounter=pdcounter)
808
809     def search_tags(self, query, filters=None, pdcounter=False):
810         """
811         Search for Tag objects using query.
812         """
813         if not filters:
814             filters = []
815         if not pdcounter:
816             filters.append(~self.index.Q(is_pdcounter=True))
817         res = self.apply_filters(query, filters).execute()
818
819         tags = []
820         pd_tags = []
821
822         for doc in res:
823             is_pdcounter = doc.get('is_pdcounter', False)
824             category = doc.get('tag_category')
825             try:
826                 if is_pdcounter:
827                     if category == 'pd_author':
828                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
829                     else:  # category == 'pd_book':
830                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
831                         tag.category = 'pd_book'  # make it look more lik a tag.
832                     pd_tags.append(tag)
833                 else:
834                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
835                     tags.append(tag)
836
837             except catalogue.models.Tag.DoesNotExist:
838                 pass
839             except PDCounterAuthor.DoesNotExist:
840                 pass
841             except PDCounterBook.DoesNotExist:
842                 pass
843
844         tags_slugs = set(map(lambda t: t.slug, tags))
845         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
846
847         log.debug('search_tags: %s' % tags)
848
849         return tags
850
851     def hint_books(self, query, prefix=True):
852         """
853         Returns auto-complete hints for book titles
854         Because we do not index 'pseudo' title-tags.
855         Prefix search.
856         """
857         q = self.index.Q()
858         query = query.strip()
859         if prefix:
860             q |= self.index.Q(title=query + "*")
861             q |= self.index.Q(title_orig=query + "*")
862         else:
863             q |= self.make_term_query(query, field='title')
864             q |= self.make_term_query(query, field='title_orig')
865         qu = self.index.query(q)
866         only_books = self.index.Q(is_book=True)
867         return self.search_books(qu, [only_books])
868
869     def search_books(self, query, filters=None, max_results=10):
870         """
871         Searches for Book objects using query
872         """
873         bks = []
874         bks_found = set()
875         query = query.query(is_book=True)
876         res = self.apply_filters(query, filters).field_limit(['book_id'])
877         for r in res:
878             try:
879                 bid = r['book_id']
880                 if bid not in bks_found:
881                     bks.append(catalogue.models.Book.objects.get(id=bid))
882                     bks_found.add(bid)
883             except catalogue.models.Book.DoesNotExist:
884                 pass
885         return bks
886
887     @staticmethod
888     def apply_filters(query, filters):
889         """
890         Apply filters to a query
891         """
892         if filters is None:
893             filters = []
894         filters = filter(lambda x: x is not None, filters)
895         for f in filters:
896             query = query.query(f)
897         return query
898
899
# When settings.SEARCH_MOCK is set to a true value, the Search class defined
# above is replaced by the one from the mock_search module.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search