1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 from django.conf import settings
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
19 from wolnelektury.utils import makedirs
# Module-level logger for the search app.
log = logging.getLogger('search')

# Build the stopword collection from the configured file, one word per line,
# skipping '#' comment lines.
# NOTE(review): the opening of the enclosing expression (presumably
# ``stopwords = set(...)`` or similar) is not visible in this excerpt --
# the two lines below are the body of that generator expression; confirm
# against the full source.
if os.path.isfile(settings.SOLR_STOPWORDS):
        line.decode('utf-8').strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
class SolrIndex(object):
    """Base class owning a connection to the Solr backend.

    Subclasses pick the access mode: ``Index`` opens read-write,
    ``Search`` opens read-only.
    """

    def __init__(self, mode=None):
        """Open a Solr connection; ``mode`` is forwarded to the client."""
        solr_endpoint = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_endpoint, mode=mode)
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    # Subdirectory of settings.SEARCH_INDEX holding one snippet file per book.
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists before any file is opened.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision

    # NOTE(review): the lines below belong to a ``path`` property whose
    # ``def``/decorator and branch keywords are elided in this excerpt.
    # File name is "<book_id>.<revision>" when a revision is set,
    # plain "<book_id>" otherwise -- confirm against the full source.
        fn = "%d.%d" % (self.book_id, self.revision)
        fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # NOTE(review): the bodies of both existence checks are elided here;
        # presumably they adjust the mode / revision before opening.
        if os.path.exists(self.path):
        if not os.path.exists(self.path):
        self.file = open(self.path, mode)

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple
        """
        # Snippets are stored UTF-8 encoded; positions/lengths are in bytes.
        txt = snippet.encode('utf-8')
        # NOTE(review): the byte-length ``l`` and the write/position update
        # happen on lines elided from this excerpt.
        pos = (self.position, l)

    # NOTE(review): ``def get(self, pos)`` header elided in this excerpt.
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        # Seek to the stored byte offset, read exactly ``length`` bytes,
        # and decode back to unicode.
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')

    # NOTE(review): ``def close(self)`` header elided in this excerpt.
        """Close snippet file"""
class Index(SolrIndex):
    """
    Class indexing books.
    """
    # NOTE(review): this excerpt is elided -- many structural lines
    # (``def`` headers, loop/branch openers, ``try`` frames, dict openers)
    # are missing. Comments below mark where visible code clearly belongs
    # to such elided constructs; confirm against the full source.

        # From the (elided) __init__: open the index read-write.
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.
        """
            # Inside an (elided) loop over ``queries``: wrap raw Lucene
            # queries into sunburnt query objects and restrict the fetched
            # fields to 'uid' only.
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
                # Paginated fetch of matching uids (the pagination loop and
                # uid accumulation are elided here).
                ids = q.paginate(start=st, rows=rows).execute()
        # Delete all collected uids in a single call.
        self.index.delete(uids)

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
                # (elided ``if tags:`` + loop) build one delete query per tag,
                # matching both id and category -- pdcounter objects share the
                # id space with catalogue tags, so id alone is ambiguous.
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                    # (elided ``else:``) ordinary catalogue tag.
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
            # (elided ``else:`` branch) no tags passed -- remove all of them.
            q = self.index.Q(tag_id__any=True)

            # then add them [all or just one passed]
                # (elided ``if not tags:``) default to every catalogue tag
                # (except private 'set' tags) plus all pdcounter entries.
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())

            # (elided ``for tag in tags:``) build one Solr doc per tag; the
            # uid suffixes (_pd_a/_pd_b) keep pdcounter entries distinct.
            if isinstance(tag, PDCounterAuthor):
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": 'pd_author',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_a" % tag.id
            elif isinstance(tag, PDCounterBook):
                    "tag_id": int(tag.id),
                    "tag_name": tag.title,
                    "tag_name_pl": tag.title,
                    "tag_category": 'pd_book',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_b" % tag.id
                # (elided ``else:``) ordinary catalogue tag document.
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": tag.category,
                    "is_pdcounter": False,
                    "uid": "tag%d" % tag.id

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        # Accept either a Book instance or a bare id (else-branch elided).
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        self.delete_query(self.index.Q(book_id=book_id))
        # Optionally remove the snippet file too (guard on
        # ``remove_snippets`` elided in this excerpt).
        snippets = Snippets(book_id)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        # we don't remove snippets, since they might be still needed by
        # threads using not reopened index
        self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        # Copy metadata into the book document (loop body elided).
        for n, f in meta_fields.items():
        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        # (elided ``book_fields = {`` opener) fields repeated on every
        # content part so part hits can be displayed without a second query.
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]
        self.index_content(book, book_fields=book_fields)

    # Master (top-level) WL-XML tags -- list opener and first items elided.
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    # Tags whose content is never indexed (editorial notes, separators...).
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    # Footnote tags: przypis autorski / tłumacza / redakcyjny / edytorski.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags skipped while walking the master element.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year in ``source_name``, e.g. "... 1884]." -> "1884".
    # NOTE(review): non-raw string with ``\]`` escape -- works, but should
    # be a raw string (r"...") to silence invalid-escape warnings.
    published_date_re = re.compile("([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname
        """
        # Parse Dublin Core from the book's XML unless already provided.
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # No type info in DC fields, so dispatch on the validator callable.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        # (elided ``else:``) join multiple persons.
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Derive the publish date from the trailing year in source_name.
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = pd

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #         yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)

        # Generator yielding (start_element, text, end_element) triples in
        # document order; exactly one slot of each triple is non-None.
        # (``def walker(node):`` header elided in this excerpt.)
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                yield None, None, node

            # Tail text belongs to the parent, so yield it unconditionally.
            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                # NOTE(review): joins the enclosing ``content`` variable,
                # not the ``text`` argument -- verify this is intentional.
                text = filter(lambda s: s is not None, content)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            # Strip verse-end '/' markers at line ends.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one Solr document for a part (header or fragment),
            # carrying the shared book_fields plus position info.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():

            doc['header_index'] = fields["header_index"]
            # NOTE(review): and/or idiom -- yields 1 if header_span is 0.
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Store the raw text in the snippet file; the index only keeps
            # its (position, length) so snippets can be re-read at query time.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))

        snippets = Snippets(book.id).open('w')

            # Walk top-level headers of the master element, tracking open
            # fragments across headers (``try`` frame elided in excerpt).
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                if header.tag is etree.Comment:

                # Default text sink: append to the current section's content
                # and to every currently-open fragment.
                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # Footnote start: divert text into a footnote buffer.
                    if start is not None and start.tag in self.footnote_tags:

                        def collect_footnote(t):
                        handle_text.append(collect_footnote)
                    # NOTE(review): ``footnote is not []`` is ALWAYS True
                    # (identity vs equality) -- likely meant ``footnote``.
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        # Index the completed footnote as its own part.
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # Suppress the theme names themselves from the text.
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.

                        # Index the closed fragment spanning from its start
                        # section up to the current one.
                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])

                    # NOTE(review): ``handle_text is not []`` is ALWAYS True
                    # (identity vs equality) -- likely meant ``handle_text``.
                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
class SearchResult(object):
    # NOTE(review): excerpt is elided -- the POSITION/FRAGMENT/SCORE/OTHER
    # index constants and several ``def``/decorator lines are missing.

    def __init__(self, doc, how_found=None, query_terms=None):
        self._processed_hits = None  # processed hits
        self.query_terms = query_terms
        # Score comes from Solr when present (guard elided in excerpt).
        self._score = doc['score']
        self.book_id = int(doc["book_id"])
        # published_date may be missing/invalid -- fall back to 0
        # (the try/except frame is elided in this excerpt).
        self.published_date = int(doc.get("published_date"))
        self.published_date = 0

        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            # NOTE(review): and/or idiom -- a span of 0 would become 1.
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            # Hit tuple: (position triple, fragment anchor, score, extras).
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])

            self._hits.append(hit)

    # Alternate constructor from a Book row (``@classmethod`` elided).
    def from_book(cls, book, how_found=None, query_terms=None):
            # Popularity count stands in for a relevance score.
            'score': book.popularity.count,
        result = cls(doc, how_found=how_found, query_terms=query_terms)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    # (``def __str__`` elided) Python 2 str repr delegates to __unicode__.
        return unicode(self).encode('utf-8')

    # (``score`` property def elided.)
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)

    # (``def get_book(self)`` elided) lazily fetch and cache the Book row.
        if self._book is not None:
        self._book = catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    # (``def process_hits(self)`` elided) -- merges raw hits into
    # displayable section/fragment entries, cached in _processed_hits.
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            # Keep, per key, the element winning the ``compare`` ordering
            # (accumulator dict and loop frame elided in this excerpt).
                    if compare(els[eif], e) >= 1:

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
            # (loop over ``sect`` elided) keep best-scoring hit per section.
            si = s[self.POSITION][self.POSITION_INDEX]
                if sections[si]['score'] >= s[self.SCORE]:

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
            m.update(s[self.OTHER])

        hits = sections.values()

            # (loop over ``frags`` + try frame elided.)
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # NOTE(review): ``split(r' +')`` splits on the LITERAL
                    # two-char string " +", not a regex -- str.split takes a
                    # plain separator; probably ``re.split`` was intended.
                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                            themes_hit.add(f[self.OTHER]['themes'][i])

            # Resolve hit theme names back to Tag objects.
            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
            m.update(f[self.OTHER])

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

    # (``@staticmethod`` elided) merge per-book results across result lists.
    def aggregate(*result_lists):
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
        return books.values()

    def __cmp__(self, other):
        # Order primarily by score; ties broken by publish date below.
        c = cmp(self.score, other.score)
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)

    # (``def __len__(self)`` elided.)
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        # Missing hit or missing key both mean "no revision"
        # (the try frame and fallback return are elided in this excerpt).
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
class Search(SolrIndex):
    # NOTE(review): excerpt is elided -- several structural lines are
    # missing from the methods below; comments mark the gaps.

    def __init__(self, default_field="text"):
        # Read-only connection for querying.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        fuzzy - should the query by fuzzy.
        """
        # NOTE(review): ``query.split(r" ")`` -- the r-prefix is pointless
        # (str.split takes a literal separator, and r" " == " ").
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
            # (loop over ``words`` elided) \m...\M are PostgreSQL word
            # boundary markers for the iregex lookup.
            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, book=True):
        # Author-only searches go through the dedicated popularity-ordered
        # lookup instead of Solr.
        if book and fields == ['authors']:
            return self.search_by_author(words)
            # (loop over words/fields elided) build one OR-filter per word,
            # skipping stopwords for content searches.
            if book or (word not in stopwords):
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                filters.append(word_filter)
        # Restrict to book documents unless searching content parts
        # (the if/else frame is elided in this excerpt).
            query = self.index.query(is_book=True)
            query = self.index.query()
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum

            # (try frame + idx init elided) read stored snippet text and
            # let Solr highlight it against the query.
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                text = snippets.get((int(position),
                snip = self.index.highlight(text=text, field=field, q=query)
                # Deduplicate identical snippets.
                if snip not in snips:

            # (``except IOError`` handler elided) diagnose whether the book
            # is missing entirely or just its snippet file.
            book = catalogue.models.Book.objects.filter(id=book_id)
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

    # (``@staticmethod`` decorator elided.)
    def apply_filters(query, filters):
        """
        Apply filters to a query
        """
        # Drop empty filters, then AND the rest onto the query
        # (the loop frame is elided in this excerpt).
        filters = filter(lambda x: x is not None, filters)
            query = query.query(f)
# When SEARCH_MOCK is enabled in settings, shadow the real Search class
# with a mock implementation (no Solr backend required, e.g. for tests).
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search