src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from itertools import chain
  15 import sunburnt
  16 import custom
  17 import operator
  18 import logging
  19 from wolnelektury.utils import makedirs
  20
  21 log = logging.getLogger('search')
  22
  23 if os.path.isfile(settings.SOLR_STOPWORDS):
  24     stopwords = set(
  25         line.decode('utf-8').strip()
  26         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  27 else:
  28     stopwords = set()
  29
  30
  31 class SolrIndex(object):
  32     def __init__(self, mode=None):
  33         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  34
  35
  36 class Snippets(object):
  37     """
  38     This class manages snippet files for indexed object (book)
  39     the snippets are concatenated together, and their positions and
  40     lengths are kept in lucene index fields.
  41     """
  42     SNIPPET_DIR = "snippets"
  43
  44     def __init__(self, book_id, revision=None):
  45         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  46         self.book_id = book_id
  47         self.revision = revision
  48         self.file = None
  49         self.position = None
  50
  51     @property
  52     def path(self):
  53         if self.revision:
  54             fn = "%d.%d" % (self.book_id, self.revision)
  55         else:
  56             fn = "%d" % self.book_id
  57
  58         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  59
  60     def open(self, mode='r'):
  61         """
  62         Open the snippet file. Call .close() afterwards.
  63         """
  64         if 'b' not in mode:
  65             mode += 'b'
  66
  67         if 'w' in mode:
  68             if os.path.exists(self.path):
  69                 self.revision = 1
  70                 while True:
  71                     if not os.path.exists(self.path):
  72                         break
  73                     self.revision += 1
  74
  75         self.file = open(self.path, mode)
  76         self.position = 0
  77         return self
  78
  79     def add(self, snippet):
  80         """
  81         Append a snippet (unicode) to the snippet file.
  82         Return a (position, length) tuple
  83         """
  84         txt = snippet.encode('utf-8')
  85         l = len(txt)
  86         self.file.write(txt)
  87         pos = (self.position, l)
  88         self.position += l
  89         return pos
  90
  91     def get(self, pos):
  92         """
  93         Given a tuple of (position, length) return an unicode
  94         of the snippet stored there.
  95         """
  96         self.file.seek(pos[0], 0)
  97         txt = self.file.read(pos[1]).decode('utf-8')
  98         return txt
  99
 100     def close(self):
 101         """Close snippet file"""
 102         if self.file:
 103             self.file.close()
 104
 105     def remove(self):
 106         self.revision = None
 107         try:
 108             os.unlink(self.path)
 109             self.revision = 0
 110             while True:
 111                 self.revision += 1
 112                 os.unlink(self.path)
 113         except OSError:
 114             pass
 115
 116
 117 class Index(SolrIndex):
 118     """
 119     Class indexing books.
 120     """
 121     def __init__(self):
 122         super(Index, self).__init__(mode='rw')
 123
 124     def delete_query(self, *queries):
 125         """
 126         index.delete(queries=...) doesn't work, so let's reimplement it
 127         using deletion of list of uids.
 128         """
 129         uids = set()
 130         for q in queries:
 131             if isinstance(q, sunburnt.search.LuceneQuery):
 132                 q = self.index.query(q)
 133             q.field_limiter.update(['uid'])
 134             st = 0
 135             rows = 100
 136             while True:
 137                 ids = q.paginate(start=st, rows=rows).execute()
 138                 if not len(ids):
 139                     break
 140                 for res in ids:
 141                     uids.add(res['uid'])
 142                 st += rows
 143         if uids:
 144             self.index.delete(uids)
 145             return True
 146         else:
 147             return False
 148
 149     def index_tags(self, *tags, **kw):
 150         """
 151         Re-index global tag list.
 152         Removes all tags from index, then index them again.
 153         Indexed fields include: id, name (with and without polish stems), category
 154         """
 155         log.debug("Indexing tags")
 156         remove_only = kw.get('remove_only', False)
 157         # first, remove tags from index.
 158         if tags:
 159             tag_qs = []
 160             for tag in tags:
 161                 q_id = self.index.Q(tag_id=tag.id)
 162
 163                 if isinstance(tag, PDCounterAuthor):
 164                     q_cat = self.index.Q(tag_category='pd_author')
 165                 elif isinstance(tag, PDCounterBook):
 166                     q_cat = self.index.Q(tag_category='pd_book')
 167                 else:
 168                     q_cat = self.index.Q(tag_category=tag.category)
 169
 170                 q_id_cat = self.index.Q(q_id & q_cat)
 171                 tag_qs.append(q_id_cat)
 172             self.delete_query(*tag_qs)
 173         else:  # all
 174             q = self.index.Q(tag_id__any=True)
 175             self.delete_query(q)
 176
 177         if not remove_only:
 178             # then add them [all or just one passed]
 179             if not tags:
 180                 tags = chain(
 181                     catalogue.models.Tag.objects.exclude(category='set'),
 182                     PDCounterAuthor.objects.all(),
 183                     PDCounterBook.objects.all())
 184
 185             for tag in tags:
 186                 if isinstance(tag, PDCounterAuthor):
 187                     doc = {
 188                         "tag_id": int(tag.id),
 189                         "tag_name": tag.name,
 190                         "tag_name_pl": tag.name,
 191                         "tag_category": 'pd_author',
 192                         "is_pdcounter": True,
 193                         "uid": "tag%d_pd_a" % tag.id
 194                         }
 195                 elif isinstance(tag, PDCounterBook):
 196                     doc = {
 197                         "tag_id": int(tag.id),
 198                         "tag_name": tag.title,
 199                         "tag_name_pl": tag.title,
 200                         "tag_category": 'pd_book',
 201                         "is_pdcounter": True,
 202                         "uid": "tag%d_pd_b" % tag.id
 203                         }
 204                 else:
 205                     doc = {
 206                         "tag_id": int(tag.id),
 207                         "tag_name": tag.name,
 208                         "tag_name_pl": tag.name,
 209                         "tag_category": tag.category,
 210                         "is_pdcounter": False,
 211                         "uid": "tag%d" % tag.id
 212                         }
 213                 self.index.add(doc)
 214
 215     def create_book_doc(self, book):
 216         """
 217         Create a lucene document referring book id.
 218         """
 219         doc = {'book_id': int(book.id)}
 220         if book.parent is not None:
 221             doc['parent_id'] = int(book.parent.id)
 222         return doc
 223
 224     def remove_book(self, book_or_id, remove_snippets=True):
 225         """Removes a book from search index.
 226         book - Book instance."""
 227         if isinstance(book_or_id, catalogue.models.Book):
 228             book_id = book_or_id.id
 229         else:
 230             book_id = book_or_id
 231
 232         self.delete_query(self.index.Q(book_id=book_id))
 233
 234         if remove_snippets:
 235             snippets = Snippets(book_id)
 236             snippets.remove()
 237
 238     def index_book(self, book, book_info=None, overwrite=True):
 239         """
 240         Indexes the book.
 241         Creates a lucene document for extracted metadata
 242         and calls self.index_content() to index the contents of the book.
 243         """
 244         if overwrite:
 245             # we don't remove snippets, since they might be still needed by
 246             # threads using not reopened index
 247             self.remove_book(book, remove_snippets=False)
 248
 249         book_doc = self.create_book_doc(book)
 250         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 251             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 252         # let's not index it - it's only used for extracting publish date
 253         if 'source_name' in meta_fields:
 254             del meta_fields['source_name']
 255
 256         for n, f in meta_fields.items():
 257             book_doc[n] = f
 258
 259         book_doc['uid'] = "book%s" % book_doc['book_id']
 260         self.index.add(book_doc)
 261         del book_doc
 262         book_fields = {
 263             'title': meta_fields['title'],
 264             'authors': meta_fields['authors'],
 265             'published_date': meta_fields['published_date']
 266             }
 267
 268         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 269             if tag_name in meta_fields:
 270                 book_fields[tag_name] = meta_fields[tag_name]
 271
 272         self.index_content(book, book_fields=book_fields)
 273
 274     master_tags = [
 275         'opowiadanie',
 276         'powiesc',
 277         'dramat_wierszowany_l',
 278         'dramat_wierszowany_lp',
 279         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 280         'wywiad',
 281     ]
 282
 283     ignore_content_tags = [
 284         'uwaga', 'extra', 'nota_red', 'abstrakt',
 285         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 286         'didaskalia',
 287         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 288     ]
 289
 290     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 291
 292     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 293                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 294
 295     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 296
 297     def extract_metadata(self, book, book_info=None, dc_only=None):
 298         """
 299         Extract metadata from book and returns a map of fields keyed by fieldname
 300         """
 301         fields = {}
 302
 303         if book_info is None:
 304             book_info = dcparser.parse(open(book.xml_file.path))
 305
 306         fields['slug'] = book.slug
 307         fields['tags'] = [t.name for t in book.tags]
 308         fields['is_book'] = True
 309
 310         # validator, name
 311         for field in dcparser.BookInfo.FIELDS:
 312             if dc_only and field.name not in dc_only:
 313                 continue
 314             if hasattr(book_info, field.name):
 315                 if not getattr(book_info, field.name):
 316                     continue
 317                 # since no type information is available, we use validator
 318                 type_indicator = field.validator
 319                 if type_indicator == dcparser.as_unicode:
 320                     s = getattr(book_info, field.name)
 321                     if field.multiple:
 322                         s = ', '.join(s)
 323                     fields[field.name] = s
 324                 elif type_indicator == dcparser.as_person:
 325                     p = getattr(book_info, field.name)
 326                     if isinstance(p, dcparser.Person):
 327                         persons = unicode(p)
 328                     else:
 329                         persons = ', '.join(map(unicode, p))
 330                     fields[field.name] = persons
 331                 elif type_indicator == dcparser.as_date:
 332                     dt = getattr(book_info, field.name)
 333                     fields[field.name] = dt
 334
 335         # get published date
 336         pd = None
 337         if hasattr(book_info, 'source_name') and book_info.source_name:
 338             match = self.published_date_re.search(book_info.source_name)
 339             if match is not None:
 340                 pd = str(match.groups()[0])
 341         if not pd:
 342             pd = ""
 343         fields["published_date"] = pd
 344
 345         return fields
 346
 347     # def add_gaps(self, fields, fieldname):
 348     #     """
 349     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 350     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 351     #     """
 352     #     def gap():
 353     #         while True:
 354     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 355     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 356
 357     def get_master(self, root):
 358         """
 359         Returns the first master tag from an etree.
 360         """
 361         for master in root.iter():
 362             if master.tag in self.master_tags:
 363                 return master
 364
 365     def index_content(self, book, book_fields):
 366         """
 367         Walks the book XML and extract content from it.
 368         Adds parts for each header tag and for each fragment.
 369         """
 370         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 371         root = wld.edoc.getroot()
 372
 373         master = self.get_master(root)
 374         if master is None:
 375             return []
 376
 377         def walker(node):
 378             if node.tag not in self.ignore_content_tags:
 379                 yield node, None, None
 380                 if node.text is not None:
 381                     yield None, node.text, None
 382                 for child in list(node):
 383                     for b, t, e in walker(child):
 384                         yield b, t, e
 385                 yield None, None, node
 386
 387             if node.tail is not None:
 388                 yield None, node.tail, None
 389             return
 390
 391         def fix_format(text):
 392             # separator = [u" ", u"\t", u".", u";", u","]
 393             if isinstance(text, list):
 394                 # need to join it first
 395                 text = filter(lambda s: s is not None, content)
 396                 text = u' '.join(text)
 397                 # for i in range(len(text)):
 398                 #     if i > 0:
 399                 #         if text[i][0] not in separator\
 400                 #             and text[i - 1][-1] not in separator:
 401                 #          text.insert(i, u" ")
 402
 403             return re.sub("(?m)/$", "", text)
 404
 405         def add_part(snippets, **fields):
 406             doc = self.create_book_doc(book)
 407             for n, v in book_fields.items():
 408                 doc[n] = v
 409
 410             doc['header_index'] = fields["header_index"]
 411             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 412             doc['header_type'] = fields['header_type']
 413
 414             doc['text'] = fields['text']
 415
 416             # snippets
 417             snip_pos = snippets.add(fields["text"])
 418
 419             doc['snippets_position'] = snip_pos[0]
 420             doc['snippets_length'] = snip_pos[1]
 421             if snippets.revision:
 422                 doc["snippets_revision"] = snippets.revision
 423
 424             if 'fragment_anchor' in fields:
 425                 doc["fragment_anchor"] = fields['fragment_anchor']
 426
 427             if 'themes' in fields:
 428                 doc['themes'] = fields['themes']
 429             doc['uid'] = "part%s-%s-%s-%s" % (
 430                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 431             return doc
 432
 433         fragments = {}
 434         snippets = Snippets(book.id).open('w')
 435         try:
 436             for header, position in zip(list(master), range(len(master))):
 437
 438                 if header.tag in self.skip_header_tags:
 439                     continue
 440                 if header.tag is etree.Comment:
 441                     continue
 442
 443                 # section content
 444                 content = []
 445                 footnote = []
 446
 447                 def all_content(text):
 448                     for frag in fragments.values():
 449                         frag['text'].append(text)
 450                     content.append(text)
 451                 handle_text = [all_content]
 452
 453                 for start, text, end in walker(header):
 454                     # handle footnotes
 455                     if start is not None and start.tag in self.footnote_tags:
 456                         footnote = []
 457
 458                         def collect_footnote(t):
 459                             footnote.append(t)
 460
 461                         handle_text.append(collect_footnote)
 462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 463                         handle_text.pop()
 464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 465                                        text=u''.join(footnote),
 466                                        is_footnote=True)
 467                         self.index.add(doc)
 468                         footnote = []
 469
 470                     # handle fragments and themes.
 471                     if start is not None and start.tag == 'begin':
 472                         fid = start.attrib['id'][1:]
 473                         fragments[fid] = {
 474                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 475
 476                     # themes for this fragment
 477                     elif start is not None and start.tag == 'motyw':
 478                         fid = start.attrib['id'][1:]
 479                         handle_text.append(lambda text: None)
 480                         if start.text is not None:
 481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 482                     elif end is not None and end.tag == 'motyw':
 483                         handle_text.pop()
 484
 485                     elif start is not None and start.tag == 'end':
 486                         fid = start.attrib['id'][1:]
 487                         if fid not in fragments:
 488                             continue  # a broken <end> node, skip it
 489                         frag = fragments[fid]
 490                         if not frag['themes']:
 491                             continue  # empty themes list.
 492                         del fragments[fid]
 493
 494                         doc = add_part(snippets,
 495                                        header_type=frag['start_header'],
 496                                        header_index=frag['start_section'],
 497                                        header_span=position - frag['start_section'] + 1,
 498                                        fragment_anchor=fid,
 499                                        text=fix_format(frag['text']),
 500                                        themes=frag['themes'])
 501                         self.index.add(doc)
 502
 503                         # Collect content.
 504
 505                     if text is not None and handle_text is not []:
 506                         hdl = handle_text[-1]
 507                         hdl(text)
 508
 509                         # in the end, add a section text.
 510                 doc = add_part(snippets, header_index=position,
 511                                header_type=header.tag, text=fix_format(content))
 512
 513                 self.index.add(doc)
 514
 515         finally:
 516             snippets.close()
 517
 518
 519 class SearchResult(object):
 520     def __init__(self, doc, how_found=None, query_terms=None):
 521         self.boost = 1.0
 522         self._hits = []
 523         self._processed_hits = None  # processed hits
 524         self.snippets = []
 525         self.query_terms = query_terms
 526         self._book = None
 527
 528         if 'score' in doc:
 529             self._score = doc['score']
 530         else:
 531             self._score = 0
 532
 533         self.book_id = int(doc["book_id"])
 534
 535         try:
 536             self.published_date = int(doc.get("published_date"))
 537         except ValueError:
 538             self.published_date = 0
 539
 540         # content hits
 541         header_type = doc.get("header_type", None)
 542         # we have a content hit in some header of fragment
 543         if header_type is not None:
 544             sec = (header_type, int(doc["header_index"]))
 545             header_span = doc['header_span']
 546             header_span = header_span is not None and int(header_span) or 1
 547             fragment = doc.get("fragment_anchor", None)
 548             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 549             snippets_rev = doc.get('snippets_revision', None)
 550
 551             hit = (sec + (header_span,), fragment, self._score, {
 552                 'how_found': how_found,
 553                 'snippets_pos': snippets_pos,
 554                 'snippets_revision': snippets_rev,
 555                 'themes': doc.get('themes', []),
 556                 'themes_pl': doc.get('themes_pl', [])
 557                 })
 558
 559             self._hits.append(hit)
 560
 561     @classmethod
 562     def from_book(cls, book, how_found=None, query_terms=None):
 563         doc = {
 564             'score': book.popularity.count,
 565             'book_id': book.id,
 566             'published_date': 0,
 567         }
 568         result = cls(doc, how_found=how_found, query_terms=query_terms)
 569         result._book = book
 570         return result
 571
 572     def __unicode__(self):
 573         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 574             (self.book_id, len(self._hits),
 575              len(self._processed_hits) if self._processed_hits else -1,
 576              self._score, len(self.snippets))
 577
 578     def __str__(self):
 579         return unicode(self).encode('utf-8')
 580
 581     @property
 582     def score(self):
 583         return self._score * self.boost
 584
 585     def merge(self, other):
 586         if self.book_id != other.book_id:
 587             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 588         self._hits += other._hits
 589         self._score += max(other._score, 0)
 590         return self
 591
 592     def get_book(self):
 593         if self._book is not None:
 594             return self._book
 595         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 596         return self._book
 597
 598     book = property(get_book)
 599
 600     POSITION = 0
 601     FRAGMENT = 1
 602     POSITION_INDEX = 1
 603     POSITION_SPAN = 2
 604     SCORE = 2
 605     OTHER = 3
 606
 607     @property
 608     def hits(self):
 609         if self._processed_hits is not None:
 610             return self._processed_hits
 611
 612         # to sections and fragments
 613         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 614
 615         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 616
 617         # sections not covered by fragments
 618         sect = filter(lambda s: 0 == len(filter(
 619             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 620                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 621
 622         def remove_duplicates(lst, keyfn, compare):
 623             els = {}
 624             for e in lst:
 625                 eif = keyfn(e)
 626                 if eif in els:
 627                     if compare(els[eif], e) >= 1:
 628                         continue
 629                 els[eif] = e
 630             return els.values()
 631
 632         # remove fragments with duplicated fid's and duplicated snippets
 633         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 634         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 635         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 636
 637         # remove duplicate sections
 638         sections = {}
 639
 640         for s in sect:
 641             si = s[self.POSITION][self.POSITION_INDEX]
 642             # skip existing
 643             if si in sections:
 644                 if sections[si]['score'] >= s[self.SCORE]:
 645                     continue
 646
 647             m = {'score': s[self.SCORE],
 648                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 649                  }
 650             m.update(s[self.OTHER])
 651             sections[si] = m
 652
 653         hits = sections.values()
 654
 655         for f in frags:
 656             try:
 657                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 658             except catalogue.models.Fragment.DoesNotExist:
 659                 # stale index
 660                 continue
 661             # Figure out if we were searching for a token matching some word in theme name.
 662             themes = frag.tags.filter(category='theme')
 663             themes_hit = set()
 664             if self.query_terms is not None:
 665                 for i in range(0, len(f[self.OTHER]['themes'])):
 666                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 667                     tms = map(unicode.lower, tms)
 668                     for qt in self.query_terms:
 669                         if qt in tms:
 670                             themes_hit.add(f[self.OTHER]['themes'][i])
 671                             break
 672
 673             def theme_by_name(n):
 674                 th = filter(lambda t: t.name == n, themes)
 675                 if th:
 676                     return th[0]
 677                 else:
 678                     return None
 679             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 680
 681             m = {'score': f[self.SCORE],
 682                  'fragment': frag,
 683                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 684                  'themes': themes,
 685                  'themes_hit': themes_hit
 686                  }
 687             m.update(f[self.OTHER])
 688             hits.append(m)
 689
 690         hits.sort(key=lambda h: h['score'], reverse=True)
 691
 692         self._processed_hits = hits
 693
 694         return hits
 695
 696     @staticmethod
 697     def aggregate(*result_lists):
 698         books = {}
 699         for rl in result_lists:
 700             for r in rl:
 701                 if r.book_id in books:
 702                     books[r.book_id].merge(r)
 703                 else:
 704                     books[r.book_id] = r
 705         return books.values()
 706
 707     def __cmp__(self, other):
 708         c = cmp(self.score, other.score)
 709         if c == 0:
 710             # this is inverted, because earlier date is better
 711             return cmp(other.published_date, self.published_date)
 712         else:
 713             return c
 714
 715     def __len__(self):
 716         return len(self.hits)
 717
 718     def snippet_pos(self, idx=0):
 719         return self.hits[idx]['snippets_pos']
 720
 721     def snippet_revision(self, idx=0):
 722         try:
 723             return self.hits[idx]['snippets_revision']
 724         except (IndexError, KeyError):
 725             return None
 726
 727
 728 class Search(SolrIndex):
 729     """
 730     Search facilities.
 731     """
 732     def __init__(self, default_field="text"):
 733         super(Search, self).__init__(mode='r')
 734
 735     def make_term_query(self, query, field='text', modal=operator.or_):
 736         """
 737         Returns term queries joined by boolean query.
 738         modal - applies to boolean query
 739         fuzzy - should the query by fuzzy.
 740         """
 741         if query is None:
 742             query = ''
 743         q = self.index.Q()
 744         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 745
 746         return q
 747
 748     def search_by_author(self, words):
 749         from catalogue.models import Book
 750         books = Book.objects.filter(parent=None).order_by('-popularity__count')
 751         for word in words:
 752             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 753         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 754
 755     def search_words(self, words, fields, book=True):
 756         if book and fields == ['authors']:
 757             return self.search_by_author(words)
 758         filters = []
 759         for word in words:
 760             if book or (word not in stopwords):
 761                 word_filter = None
 762                 for field in fields:
 763                     q = self.index.Q(**{field: word})
 764                     if word_filter is None:
 765                         word_filter = q
 766                     else:
 767                         word_filter |= q
 768                 filters.append(word_filter)
 769         if not filters:
 770             return []
 771         if book:
 772             query = self.index.query(is_book=True)
 773         else:
 774             query = self.index.query()
 775         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 776         return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]
 777
 778     def get_snippets(self, searchresult, query, field='text', num=1):
 779         """
 780         Returns a snippet for found scoreDoc.
 781         """
 782         maxnum = len(searchresult)
 783         if num is None or num < 0 or num > maxnum:
 784             num = maxnum
 785         book_id = searchresult.book_id
 786         revision = searchresult.snippet_revision()
 787         snippets = Snippets(book_id, revision=revision)
 788         snips = [None] * maxnum
 789         try:
 790             snippets.open()
 791             idx = 0
 792             while idx < maxnum and num > 0:
 793                 position, length = searchresult.snippet_pos(idx)
 794                 if position is None or length is None:
 795                     continue
 796                 text = snippets.get((int(position),
 797                                      int(length)))
 798                 snip = self.index.highlight(text=text, field=field, q=query)
 799                 if snip not in snips:
 800                     snips[idx] = snip
 801                     if snip:
 802                         num -= 1
 803                 idx += 1
 804
 805         except IOError, e:
 806             book = catalogue.models.Book.objects.filter(id=book_id)
 807             if not book:
 808                 log.error("Book does not exist for book id = %d" % book_id)
 809             elif not book.get().children.exists():
 810                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 811             return []
 812         finally:
 813             snippets.close()
 814
 815             # remove verse end markers..
 816         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 817
 818         searchresult.snippets = snips
 819
 820         return snips
 821
 822     @staticmethod
 823     def apply_filters(query, filters):
 824         """
 825         Apply filters to a query
 826         """
 827         if filters is None:
 828             filters = []
 829         filters = filter(lambda x: x is not None, filters)
 830         for f in filters:
 831             query = query.query(f)
 832         return query
 833
 834
 835 if getattr(settings, 'SEARCH_MOCK', False):
 836     from .mock_search import Search