ignore stopwords in query
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
# Load the Solr stopword list (one word per line; '#' starts a comment).
# Queries drop these words before searching.  A missing file simply means
# no stopwords.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # Use a context manager so the file handle is closed promptly
    # (the previous version leaked the open file to the GC).
    with open(settings.SOLR_STOPWORDS) as f:
        stopwords = set(
            line.decode('utf-8').strip()
            for line in f if not line.startswith('#'))
else:
    stopwords = set()
29
30
class SolrIndex(object):
    # Thin base class holding a configured Solr connection; subclasses pick
    # the access mode ('r' for read-only search, 'rw' for indexing).
    def __init__(self, mode=None):
        # CustomSolrInterface wraps the sunburnt Solr client with
        # project-specific helpers (see the local `custom` module).
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
34
35
class Snippets(object):
    """
    Manages the snippet file for an indexed object (book).

    Snippet texts are appended one after another into a single flat file;
    each snippet's (position, length) pair is stored in the search index
    so the text can be read back later.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Filesystem path of the snippet file for this book/revision."""
        if self.revision:
            filename = "%d.%d" % (self.book_id, self.revision)
        else:
            filename = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, filename)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode). Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # Never overwrite an existing file: bump the revision number
            # until we land on an unused path.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the file.
        Return its (position, length) tuple.
        """
        encoded = snippet.encode('utf-8')
        length = len(encoded)
        self.file.write(encoded)
        stored_at = (self.position, length)
        self.position += length
        return stored_at

    def get(self, pos):
        """
        Read back the unicode snippet stored at the given (position, length).
        """
        offset, length = pos
        self.file.seek(offset, 0)
        return self.file.read(length).decode('utf-8')

    def close(self):
        """Close the snippet file, if one is open."""
        if self.file:
            self.file.close()

    def remove(self):
        """Delete the snippet file and every numbered revision of it."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # First missing revision ends the sweep.
            pass
115
116
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Returns True if anything was actually deleted.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            # Page through every match, collecting uids.
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                # PDCounter tags live under dedicated categories so their ids
                # don't clash with regular catalogue tags.
                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        # These fields are repeated on every content part of this book.
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)

    # Tags that hold the actual text of a master document.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    # Tags whose inner content should not be indexed (notes, separators etc.).
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    # Footnote tags: their content is indexed separately as is_footnote parts.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags skipped entirely while indexing content.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year (optionally followed by "]", "." or spaces) in source_name.
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname

        dc_only - if set, only DC fields from this list are extracted.
        """
        fields = {}

        if book_info is None:
            # Close the XML file promptly instead of leaking the handle.
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # Depth-first walk yielding (start-node, text, end-node) triples.
            # Content of ignored tags is skipped, but their tail text is kept.
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # Join a list of collected strings into one, then strip verse-end
            # markers ("/" at end of line).
            if isinstance(text, list):
                # need to join it first
                # (fixed: previously this filtered the enclosing `content`
                # list instead of the `text` argument, so fragments spanning
                # sections were indexed with the wrong text)
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one index document for a section or fragment, storing its
            # text into the snippet file as a side effect.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            # default span is a single section
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    # every still-open fragment also collects this text
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                # stack of text handlers; the innermost (last) one is active
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        # (the old `footnote is not []` guard compared by
                        # identity and was always true, so it was dropped)
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # theme names themselves are not part of the text
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content through the currently active handler.
                    # (the old `handle_text is not []` identity check was
                    # always true; the stack is never empty anyway)
                    if text is not None:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
517
518
class SearchResult(object):
    """
    One search hit: a single book, with matching sections/fragments
    accumulated from possibly many index documents (see merge()).
    """
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # missing field gives int(None) -> TypeError;
            # a malformed value gives ValueError
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        """Raw index score scaled by the mutable boost factor."""
        return self._score * self.boost

    def merge(self, other):
        """Merge hits for the same book; keeps the higher score."""
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        # Lazily fetched and cached Book instance.
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indexes into the hit tuple and its position sub-tuple.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """Hits processed into dicts (sections and fragments), best first."""
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            # keep, per key, the element that wins the `compare` ordering
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # fixed: .split(r' +') split on the literal " +" substring
                    # and never actually tokenized multi-word theme names
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results from several lists into one result per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the idx-th hit's snippet."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet-file revision for the idx-th hit, or None."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
716
717
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        # default_field is currently unused; kept for interface compatibility.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        """
        if query is None:
            query = ''
        q = self.index.Q()
        # split() drops empty tokens (the old split(r" ") produced
        # empty-string terms for runs of spaces)
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split()), q)

        return q

    def search_words(self, words, fields, book=True):
        """
        Search for documents matching the given words: each non-stopword
        must match (AND across words), in any of the given fields (OR).
        Returns a list of SearchResult.
        """
        filters = []
        for word in words:
            # stopwords never contribute a filter
            if word not in stopwords:
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if not filters:
            return []
        if book:
            query = self.index.query(is_book=True)
        else:
            query = self.index.query()
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # skip this hit; advancing idx here fixes the infinite
                    # loop that a bare `continue` used to cause
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1

        except IOError as e:
            # `as` form works on Python 2.6+ (the old comma form did not
            # parse on Python 3)
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
814
815
# When the SEARCH_MOCK setting is enabled (e.g. in tests), shadow the real
# Search class with the mock implementation.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search