# cleanup solr schema
# [wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
# Load the solr stopword list, if present ('#'-prefixed lines are
# comments).  Used by Search.search_words to skip insignificant words
# in non-book queries.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # Use a context manager so the file handle is not leaked.
    with open(settings.SOLR_STOPWORDS) as f:
        stopwords = set(
            line.decode('utf-8').strip()
            for line in f if not line.startswith('#'))
else:
    stopwords = set()
29
30
class SolrIndex(object):
    """Thin base class giving subclasses a solr interface handle."""

    def __init__(self, mode=None):
        # mode ('r', 'rw', ...) is passed straight through to the interface.
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
34
35
class Snippets(object):
    """
    Handles the snippet file for an indexed object (book).

    All snippets of a book are concatenated into a single file; their
    (position, length) pairs are kept in lucene index fields, so any
    snippet can be read back with a simple seek + read.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file; revision-suffixed when a revision is set."""
        if self.revision:
            name = "%d.%d" % (self.book_id, self.revision)
        else:
            name = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, name)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode).
        Call .close() afterwards.

        When opening for writing, an existing file is never clobbered:
        the revision number is bumped until an unused path is found.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        encoded = snippet.encode('utf-8')
        size = len(encoded)
        self.file.write(encoded)
        stored_at = (self.position, size)
        self.position += size
        return stored_at

    def get(self, pos):
        """
        Read back the unicode snippet stored at the given
        (position, length) pair.
        """
        position, length = pos
        self.file.seek(position, 0)
        return self.file.read(length).decode('utf-8')

    def close(self):
        """Close the snippet file, if open."""
        if self.file:
            self.file.close()

    def remove(self):
        """Remove the snippet file along with all of its revisions."""
        self.revision = None
        try:
            os.unlink(self.path)
        except OSError:
            return
        self.revision = 1
        try:
            while True:
                os.unlink(self.path)
                self.revision += 1
        except OSError:
            pass
115
116
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Accepts LuceneQuery objects as well as ready-made solr queries.
        Returns True if any documents were actually deleted.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            # Page through all matches, collecting their uids.
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category.

        Keyword arguments:
        remove_only -- only remove tags from the index; don't add them back.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                # PDCounter tags live under pseudo-categories so their ids
                # don't clash with regular catalogue tags.
                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            self.delete_query(*tag_qs)
        else:  # all
            self.delete_query(self.index.Q(tag_id__any=True))

        if remove_only:
            return

        # Then add them [all, or just the ones passed].
        if not tags:
            tags = chain(
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())

        for tag in tags:
            if isinstance(tag, PDCounterAuthor):
                doc = {
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": 'pd_author',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_a" % tag.id
                    }
            elif isinstance(tag, PDCounterBook):
                doc = {
                    "tag_id": int(tag.id),
                    "tag_name": tag.title,
                    "tag_name_pl": tag.title,
                    "tag_category": 'pd_book',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_b" % tag.id
                    }
            else:
                doc = {
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": tag.category,
                    "is_pdcounter": False,
                    "uid": "tag%d" % tag.id
                    }
            self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id (and parent id, if any).
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc

        # Fields repeated on every content part of this book.
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)

    # Tags that open a book's master (indexable) part.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    # Tags whose text content should not be indexed at all.
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    # Footnote tags; their content is indexed as separate footnote parts.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags that should not become content parts.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing number in source_name is taken as the publication date.
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.

        dc_only -- if given, only these Dublin Core fields are extracted.
        """
        fields = {}

        if book_info is None:
            # Close the file once parsed instead of leaking the handle.
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date from the trailing number of source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # Depth-first walk yielding (start, text, end) triples:
            # (node, None, None) on entering a node, (None, text, None)
            # for each text chunk, (None, None, node) on leaving.
            # Ignored tags are skipped entirely, but their tails still count.
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # BUGFIX: this used to read the enclosing loop's `content`
            # list instead of the `text` argument, so fragments were
            # indexed with the whole section's text.
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            # strip verse-end markers
            return re.sub(r"(?m)/$", "", text)

        def add_part(snippets, **fields):
            """Build a content-part document and record its snippet."""
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                # stack of text handlers; the innermost one wins
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        # (the old `footnote is not []` guard here was an
                        # always-true identity check and has been removed)
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # theme names are not part of the indexed text
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
516
517
class SearchResult(object):
    """
    A single search hit: aggregates all matches found within one book,
    together with scoring and snippet bookkeeping.
    """
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # published_date missing (None) or not a number
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        """Build a book-level result (no content hits) for a Book instance."""
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        """Fold another result for the same book into this one."""
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        """Fetch (and cache) the Book instance for this result."""
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into the raw hit tuples kept in self._hits.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """
        Processed hits: a list of dicts, one per section or fragment hit,
        sorted by score (descending) and cached after the first call.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # split raw hits into fragment hits and bare section hits
        frags = [r for r in self._hits if r[self.FRAGMENT] is not None]
        sect = [r for r in self._hits if r[self.FRAGMENT] is None]

        # keep only sections not covered by any fragment hit
        def covered_by_fragment(s):
            si = s[self.POSITION][self.POSITION_INDEX]
            return any(
                f[self.POSITION][self.POSITION_INDEX] <= si <
                f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
                for f in frags)
        sect = [s for s in sect if not covered_by_fragment(s)]

        def remove_duplicates(lst, keyfn, replaces):
            """Keep one element per key; replaces(new, old) decides which wins."""
            els = {}
            for e in lst:
                key = keyfn(e)
                if key not in els or replaces(e, els[key]):
                    els[key] = e
            return list(els.values())

        # remove fragments with duplicated fid's, keeping the best score
        frags = remove_duplicates(
            frags, lambda f: f[self.FRAGMENT],
            lambda a, b: a[self.SCORE] >= b[self.SCORE])

        # deduplicate sections by section index, keeping the best score
        sections = {}
        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing, better-scored entry
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue
            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # BUGFIX: the old code called str.split(r' +'), which
                    # splits on the literal two-character string " +", not
                    # a regex; split on whitespace to get individual words.
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split()
                    tms = [w.lower() for w in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                matching = [t for t in themes if t.name == n]
                return matching[0] if matching else None
            themes_hit = [t for t in map(theme_by_name, themes_hit) if t is not None]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge several result lists into one result per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return list(books.values())

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the idx-th hit's snippet."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision for the idx-th hit, or None."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
725
726
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        # default_field is accepted for interface compatibility but is
        # not currently used.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        """
        if query is None:
            query = ''
        q = self.index.Q()
        # one term query per space-separated token, folded with `modal`
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q

    def search_by_author(self, words):
        """Whole-word, case-insensitive author search; top 30 books."""
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # \m and \M are PostgreSQL regex word-boundary markers
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, book=True):
        """
        Search for the given words in the given index fields.
        book -- restrict to book documents; stopwords are only skipped
        for non-book queries.
        """
        if book and fields == ['authors']:
            return self.search_by_author(words)
        filters = []
        for word in words:
            if book or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if not filters:
            return []
        if book:
            query = self.index.query(is_book=True)
        else:
            query = self.index.query()
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # BUGFIX: a bare `continue` here skipped the idx
                    # increment and looped forever.
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s and s.replace("/\n", "\n") for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query; None entries (and a None list) are ignored.
        """
        if filters is None:
            filters = []
        for f in filters:
            if f is not None:
                query = query.query(f)
        return query
832
833
# Allow settings.SEARCH_MOCK to swap in a mock Search implementation
# (the import shadows the Search class defined above).
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search