src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from itertools import chain
  15 import sunburnt
  16 import custom
  17 import operator
  18 import logging
  19 from wolnelektury.utils import makedirs
  20
# Module-level logger shared by the indexing and search classes below.
log = logging.getLogger('search')
  22
  23
  24 class SolrIndex(object):
  25     def __init__(self, mode=None):
  26         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  27
  28
  29 class Snippets(object):
  30     """
  31     This class manages snippet files for indexed object (book)
  32     the snippets are concatenated together, and their positions and
  33     lengths are kept in lucene index fields.
  34     """
  35     SNIPPET_DIR = "snippets"
  36
  37     def __init__(self, book_id, revision=None):
  38         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         self.book_id = book_id
  40         self.revision = revision
  41         self.file = None
  42         self.position = None
  43
  44     @property
  45     def path(self):
  46         if self.revision:
  47             fn = "%d.%d" % (self.book_id, self.revision)
  48         else:
  49             fn = "%d" % self.book_id
  50
  51         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  52
  53     def open(self, mode='r'):
  54         """
  55         Open the snippet file. Call .close() afterwards.
  56         """
  57         if 'b' not in mode:
  58             mode += 'b'
  59
  60         if 'w' in mode:
  61             if os.path.exists(self.path):
  62                 self.revision = 1
  63                 while True:
  64                     if not os.path.exists(self.path):
  65                         break
  66                     self.revision += 1
  67
  68         self.file = open(self.path, mode)
  69         self.position = 0
  70         return self
  71
  72     def add(self, snippet):
  73         """
  74         Append a snippet (unicode) to the snippet file.
  75         Return a (position, length) tuple
  76         """
  77         txt = snippet.encode('utf-8')
  78         l = len(txt)
  79         self.file.write(txt)
  80         pos = (self.position, l)
  81         self.position += l
  82         return pos
  83
  84     def get(self, pos):
  85         """
  86         Given a tuple of (position, length) return an unicode
  87         of the snippet stored there.
  88         """
  89         self.file.seek(pos[0], 0)
  90         txt = self.file.read(pos[1]).decode('utf-8')
  91         return txt
  92
  93     def close(self):
  94         """Close snippet file"""
  95         if self.file:
  96             self.file.close()
  97
  98     def remove(self):
  99         self.revision = None
 100         try:
 101             os.unlink(self.path)
 102             self.revision = 0
 103             while True:
 104                 self.revision += 1
 105                 os.unlink(self.path)
 106         except OSError:
 107             pass
 108
 109
 110 class Index(SolrIndex):
 111     """
 112     Class indexing books.
 113     """
    def __init__(self):
        """Set up the Solr interface with mode='rw'.

        NOTE(review): 'rw' presumably means read-write access; confirm in
        custom.CustomSolrInterface.
        """
        super(Index, self).__init__(mode='rw')
 116
 117     def delete_query(self, *queries):
 118         """
 119         index.delete(queries=...) doesn't work, so let's reimplement it
 120         using deletion of list of uids.
 121         """
 122         uids = set()
 123         for q in queries:
 124             if isinstance(q, sunburnt.search.LuceneQuery):
 125                 q = self.index.query(q)
 126             q.field_limiter.update(['uid'])
 127             st = 0
 128             rows = 100
 129             while True:
 130                 ids = q.paginate(start=st, rows=rows).execute()
 131                 if not len(ids):
 132                     break
 133                 for res in ids:
 134                     uids.add(res['uid'])
 135                 st += rows
 136         if uids:
 137             self.index.delete(uids)
 138             return True
 139         else:
 140             return False
 141
 142     def index_tags(self, *tags, **kw):
 143         """
 144         Re-index global tag list.
 145         Removes all tags from index, then index them again.
 146         Indexed fields include: id, name (with and without polish stems), category
 147         """
 148         log.debug("Indexing tags")
 149         remove_only = kw.get('remove_only', False)
 150         # first, remove tags from index.
 151         if tags:
 152             tag_qs = []
 153             for tag in tags:
 154                 q_id = self.index.Q(tag_id=tag.id)
 155
 156                 if isinstance(tag, PDCounterAuthor):
 157                     q_cat = self.index.Q(tag_category='pd_author')
 158                 elif isinstance(tag, PDCounterBook):
 159                     q_cat = self.index.Q(tag_category='pd_book')
 160                 else:
 161                     q_cat = self.index.Q(tag_category=tag.category)
 162
 163                 q_id_cat = self.index.Q(q_id & q_cat)
 164                 tag_qs.append(q_id_cat)
 165             self.delete_query(*tag_qs)
 166         else:  # all
 167             q = self.index.Q(tag_id__any=True)
 168             self.delete_query(q)
 169
 170         if not remove_only:
 171             # then add them [all or just one passed]
 172             if not tags:
 173                 tags = chain(
 174                     catalogue.models.Tag.objects.exclude(category='set'),
 175                     PDCounterAuthor.objects.all(),
 176                     PDCounterBook.objects.all())
 177
 178             for tag in tags:
 179                 if isinstance(tag, PDCounterAuthor):
 180                     doc = {
 181                         "tag_id": int(tag.id),
 182                         "tag_name": tag.name,
 183                         "tag_name_pl": tag.name,
 184                         "tag_category": 'pd_author',
 185                         "is_pdcounter": True,
 186                         "uid": "tag%d_pd_a" % tag.id
 187                         }
 188                 elif isinstance(tag, PDCounterBook):
 189                     doc = {
 190                         "tag_id": int(tag.id),
 191                         "tag_name": tag.title,
 192                         "tag_name_pl": tag.title,
 193                         "tag_category": 'pd_book',
 194                         "is_pdcounter": True,
 195                         "uid": "tag%d_pd_b" % tag.id
 196                         }
 197                 else:
 198                     doc = {
 199                         "tag_id": int(tag.id),
 200                         "tag_name": tag.name,
 201                         "tag_name_pl": tag.name,
 202                         "tag_category": tag.category,
 203                         "is_pdcounter": False,
 204                         "uid": "tag%d" % tag.id
 205                         }
 206                 self.index.add(doc)
 207
 208     def create_book_doc(self, book):
 209         """
 210         Create a lucene document referring book id.
 211         """
 212         doc = {'book_id': int(book.id)}
 213         if book.parent is not None:
 214             doc['parent_id'] = int(book.parent.id)
 215         return doc
 216
 217     def remove_book(self, book_or_id, remove_snippets=True):
 218         """Removes a book from search index.
 219         book - Book instance."""
 220         if isinstance(book_or_id, catalogue.models.Book):
 221             book_id = book_or_id.id
 222         else:
 223             book_id = book_or_id
 224
 225         self.delete_query(self.index.Q(book_id=book_id))
 226
 227         if remove_snippets:
 228             snippets = Snippets(book_id)
 229             snippets.remove()
 230
 231     def index_book(self, book, book_info=None, overwrite=True):
 232         """
 233         Indexes the book.
 234         Creates a lucene document for extracted metadata
 235         and calls self.index_content() to index the contents of the book.
 236         """
 237         if overwrite:
 238             # we don't remove snippets, since they might be still needed by
 239             # threads using not reopened index
 240             self.remove_book(book, remove_snippets=False)
 241
 242         book_doc = self.create_book_doc(book)
 243         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 244             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 245         # let's not index it - it's only used for extracting publish date
 246         if 'source_name' in meta_fields:
 247             del meta_fields['source_name']
 248
 249         for n, f in meta_fields.items():
 250             book_doc[n] = f
 251
 252         book_doc['uid'] = "book%s" % book_doc['book_id']
 253         self.index.add(book_doc)
 254         del book_doc
 255         book_fields = {
 256             'title': meta_fields['title'],
 257             'authors': meta_fields['authors'],
 258             'published_date': meta_fields['published_date']
 259             }
 260
 261         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 262             if tag_name in meta_fields:
 263                 book_fields[tag_name] = meta_fields[tag_name]
 264
 265         self.index_content(book, book_fields=book_fields)
 266
 267     master_tags = [
 268         'opowiadanie',
 269         'powiesc',
 270         'dramat_wierszowany_l',
 271         'dramat_wierszowany_lp',
 272         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 273         'wywiad',
 274         ]
 275
 276     ignore_content_tags = [
 277         'uwaga', 'extra', 'nota_red',
 278         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 279         'didaskalia',
 280         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 281         ]
 282
 283     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 284
 285     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 286                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 287
 288     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 289
 290     def extract_metadata(self, book, book_info=None, dc_only=None):
 291         """
 292         Extract metadata from book and returns a map of fields keyed by fieldname
 293         """
 294         fields = {}
 295
 296         if book_info is None:
 297             book_info = dcparser.parse(open(book.xml_file.path))
 298
 299         fields['slug'] = book.slug
 300         fields['tags'] = [t.name for t in book.tags]
 301         fields['is_book'] = True
 302
 303         # validator, name
 304         for field in dcparser.BookInfo.FIELDS:
 305             if dc_only and field.name not in dc_only:
 306                 continue
 307             if hasattr(book_info, field.name):
 308                 if not getattr(book_info, field.name):
 309                     continue
 310                 # since no type information is available, we use validator
 311                 type_indicator = field.validator
 312                 if type_indicator == dcparser.as_unicode:
 313                     s = getattr(book_info, field.name)
 314                     if field.multiple:
 315                         s = ', '.join(s)
 316                     fields[field.name] = s
 317                 elif type_indicator == dcparser.as_person:
 318                     p = getattr(book_info, field.name)
 319                     if isinstance(p, dcparser.Person):
 320                         persons = unicode(p)
 321                     else:
 322                         persons = ', '.join(map(unicode, p))
 323                     fields[field.name] = persons
 324                 elif type_indicator == dcparser.as_date:
 325                     dt = getattr(book_info, field.name)
 326                     fields[field.name] = dt
 327
 328         # get published date
 329         pd = None
 330         if hasattr(book_info, 'source_name') and book_info.source_name:
 331             match = self.published_date_re.search(book_info.source_name)
 332             if match is not None:
 333                 pd = str(match.groups()[0])
 334         if not pd:
 335             pd = ""
 336         fields["published_date"] = pd
 337
 338         return fields
 339
 340     # def add_gaps(self, fields, fieldname):
 341     #     """
 342     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 343     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 344     #     """
 345     #     def gap():
 346     #         while True:
 347     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 348     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 349
 350     def get_master(self, root):
 351         """
 352         Returns the first master tag from an etree.
 353         """
 354         for master in root.iter():
 355             if master.tag in self.master_tags:
 356                 return master
 357
 358     def index_content(self, book, book_fields):
 359         """
 360         Walks the book XML and extract content from it.
 361         Adds parts for each header tag and for each fragment.
 362         """
 363         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 364         root = wld.edoc.getroot()
 365
 366         master = self.get_master(root)
 367         if master is None:
 368             return []
 369
 370         def walker(node):
 371             if node.tag not in self.ignore_content_tags:
 372                 yield node, None, None
 373                 if node.text is not None:
 374                     yield None, node.text, None
 375                 for child in list(node):
 376                     for b, t, e in walker(child):
 377                         yield b, t, e
 378                 yield None, None, node
 379
 380             if node.tail is not None:
 381                 yield None, node.tail, None
 382             return
 383
 384         def fix_format(text):
 385             # separator = [u" ", u"\t", u".", u";", u","]
 386             if isinstance(text, list):
 387                 # need to join it first
 388                 text = filter(lambda s: s is not None, content)
 389                 text = u' '.join(text)
 390                 # for i in range(len(text)):
 391                 #     if i > 0:
 392                 #         if text[i][0] not in separator\
 393                 #             and text[i - 1][-1] not in separator:
 394                 #          text.insert(i, u" ")
 395
 396             return re.sub("(?m)/$", "", text)
 397
 398         def add_part(snippets, **fields):
 399             doc = self.create_book_doc(book)
 400             for n, v in book_fields.items():
 401                 doc[n] = v
 402
 403             doc['header_index'] = fields["header_index"]
 404             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 405             doc['header_type'] = fields['header_type']
 406
 407             doc['text'] = fields['text']
 408
 409             # snippets
 410             snip_pos = snippets.add(fields["text"])
 411
 412             doc['snippets_position'] = snip_pos[0]
 413             doc['snippets_length'] = snip_pos[1]
 414             if snippets.revision:
 415                 doc["snippets_revision"] = snippets.revision
 416
 417             if 'fragment_anchor' in fields:
 418                 doc["fragment_anchor"] = fields['fragment_anchor']
 419
 420             if 'themes' in fields:
 421                 doc['themes'] = fields['themes']
 422             doc['uid'] = "part%s-%s-%s-%s" % (
 423                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 424             return doc
 425
 426         fragments = {}
 427         snippets = Snippets(book.id).open('w')
 428         try:
 429             for header, position in zip(list(master), range(len(master))):
 430
 431                 if header.tag in self.skip_header_tags:
 432                     continue
 433                 if header.tag is etree.Comment:
 434                     continue
 435
 436                 # section content
 437                 content = []
 438                 footnote = []
 439
 440                 def all_content(text):
 441                     for frag in fragments.values():
 442                         frag['text'].append(text)
 443                     content.append(text)
 444                 handle_text = [all_content]
 445
 446                 for start, text, end in walker(header):
 447                     # handle footnotes
 448                     if start is not None and start.tag in self.footnote_tags:
 449                         footnote = []
 450
 451                         def collect_footnote(t):
 452                             footnote.append(t)
 453
 454                         handle_text.append(collect_footnote)
 455                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 456                         handle_text.pop()
 457                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 458                                        text=u''.join(footnote),
 459                                        is_footnote=True)
 460                         self.index.add(doc)
 461                         footnote = []
 462
 463                     # handle fragments and themes.
 464                     if start is not None and start.tag == 'begin':
 465                         fid = start.attrib['id'][1:]
 466                         fragments[fid] = {
 467                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 468
 469                     # themes for this fragment
 470                     elif start is not None and start.tag == 'motyw':
 471                         fid = start.attrib['id'][1:]
 472                         handle_text.append(lambda text: None)
 473                         if start.text is not None:
 474                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 475                     elif end is not None and end.tag == 'motyw':
 476                         handle_text.pop()
 477
 478                     elif start is not None and start.tag == 'end':
 479                         fid = start.attrib['id'][1:]
 480                         if fid not in fragments:
 481                             continue  # a broken <end> node, skip it
 482                         frag = fragments[fid]
 483                         if not frag['themes']:
 484                             continue  # empty themes list.
 485                         del fragments[fid]
 486
 487                         doc = add_part(snippets,
 488                                        header_type=frag['start_header'],
 489                                        header_index=frag['start_section'],
 490                                        header_span=position - frag['start_section'] + 1,
 491                                        fragment_anchor=fid,
 492                                        text=fix_format(frag['text']),
 493                                        themes=frag['themes'])
 494                         self.index.add(doc)
 495
 496                         # Collect content.
 497
 498                     if text is not None and handle_text is not []:
 499                         hdl = handle_text[-1]
 500                         hdl(text)
 501
 502                         # in the end, add a section text.
 503                 doc = add_part(snippets, header_index=position,
 504                                header_type=header.tag, text=fix_format(content))
 505
 506                 self.index.add(doc)
 507
 508         finally:
 509             snippets.close()
 510
 511
 512 class SearchResult(object):
 513     def __init__(self, doc, how_found=None, query_terms=None):
 514         self.boost = 1.0
 515         self._hits = []
 516         self._processed_hits = None  # processed hits
 517         self.snippets = []
 518         self.query_terms = query_terms
 519         self._book = None
 520
 521         if 'score' in doc:
 522             self._score = doc['score']
 523         else:
 524             self._score = 0
 525
 526         self.book_id = int(doc["book_id"])
 527
 528         try:
 529             self.published_date = int(doc.get("published_date"))
 530         except ValueError:
 531             self.published_date = 0
 532
 533         # content hits
 534         header_type = doc.get("header_type", None)
 535         # we have a content hit in some header of fragment
 536         if header_type is not None:
 537             sec = (header_type, int(doc["header_index"]))
 538             header_span = doc['header_span']
 539             header_span = header_span is not None and int(header_span) or 1
 540             fragment = doc.get("fragment_anchor", None)
 541             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 542             snippets_rev = doc.get('snippets_revision', None)
 543
 544             hit = (sec + (header_span,), fragment, self._score, {
 545                 'how_found': how_found,
 546                 'snippets_pos': snippets_pos,
 547                 'snippets_revision': snippets_rev,
 548                 'themes': doc.get('themes', []),
 549                 'themes_pl': doc.get('themes_pl', [])
 550                 })
 551
 552             self._hits.append(hit)
 553
 554     def __unicode__(self):
 555         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 556             (self.book_id, len(self._hits),
 557              len(self._processed_hits) if self._processed_hits else -1,
 558              self._score, len(self.snippets))
 559
 560     def __str__(self):
 561         return unicode(self).encode('utf-8')
 562
    @property
    def score(self):
        """The stored score multiplied by this result's boost factor."""
        return self._score * self.boost
 566
 567     def merge(self, other):
 568         if self.book_id != other.book_id:
 569             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 570         self._hits += other._hits
 571         if other.score > self.score:
 572             self._score = other._score
 573         return self
 574
 575     def get_book(self):
 576         if self._book is not None:
 577             return self._book
 578         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 579         return self._book
 580
 581     book = property(get_book)
 582
    # Indices into the hit tuples built in __init__:
    #   (position, fragment_anchor, score, other)
    # where position is itself a tuple:
    #   (header_type, header_index, header_span)
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1  # header_index inside the position tuple
    POSITION_SPAN = 2  # header_span inside the position tuple
    SCORE = 2
    OTHER = 3  # dict with how_found, snippet position/revision and themes
 589
 590     @property
 591     def hits(self):
 592         if self._processed_hits is not None:
 593             return self._processed_hits
 594
 595         # to sections and fragments
 596         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 597
 598         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 599
 600         # sections not covered by fragments
 601         sect = filter(lambda s: 0 == len(filter(
 602             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 603                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 604
 605         def remove_duplicates(lst, keyfn, compare):
 606             els = {}
 607             for e in lst:
 608                 eif = keyfn(e)
 609                 if eif in els:
 610                     if compare(els[eif], e) >= 1:
 611                         continue
 612                 els[eif] = e
 613             return els.values()
 614
 615         # remove fragments with duplicated fid's and duplicated snippets
 616         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 617         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 618         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 619
 620         # remove duplicate sections
 621         sections = {}
 622
 623         for s in sect:
 624             si = s[self.POSITION][self.POSITION_INDEX]
 625             # skip existing
 626             if si in sections:
 627                 if sections[si]['score'] >= s[self.SCORE]:
 628                     continue
 629
 630             m = {'score': s[self.SCORE],
 631                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 632                  }
 633             m.update(s[self.OTHER])
 634             sections[si] = m
 635
 636         hits = sections.values()
 637
 638         for f in frags:
 639             try:
 640                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 641             except catalogue.models.Fragment.DoesNotExist:
 642                 # stale index
 643                 continue
 644             # Figure out if we were searching for a token matching some word in theme name.
 645             themes = frag.tags.filter(category='theme')
 646             themes_hit = set()
 647             if self.query_terms is not None:
 648                 for i in range(0, len(f[self.OTHER]['themes'])):
 649                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 650                     tms = map(unicode.lower, tms)
 651                     for qt in self.query_terms:
 652                         if qt in tms:
 653                             themes_hit.add(f[self.OTHER]['themes'][i])
 654                             break
 655
 656             def theme_by_name(n):
 657                 th = filter(lambda t: t.name == n, themes)
 658                 if th:
 659                     return th[0]
 660                 else:
 661                     return None
 662             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 663
 664             m = {'score': f[self.SCORE],
 665                  'fragment': frag,
 666                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 667                  'themes': themes,
 668                  'themes_hit': themes_hit
 669                  }
 670             m.update(f[self.OTHER])
 671             hits.append(m)
 672
 673         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 674
 675         self._processed_hits = hits
 676
 677         return hits
 678
 679     @staticmethod
 680     def aggregate(*result_lists):
 681         books = {}
 682         for rl in result_lists:
 683             for r in rl:
 684                 if r.book_id in books:
 685                     books[r.book_id].merge(r)
 686                 else:
 687                     books[r.book_id] = r
 688         return books.values()
 689
 690     def __cmp__(self, other):
 691         c = cmp(self.score, other.score)
 692         if c == 0:
 693             # this is inverted, because earlier date is better
 694             return cmp(other.published_date, self.published_date)
 695         else:
 696             return c
 697
 698     def __len__(self):
 699         return len(self.hits)
 700
 701     def snippet_pos(self, idx=0):
 702         return self.hits[idx]['snippets_pos']
 703
 704     def snippet_revision(self, idx=0):
 705         try:
 706             return self.hits[idx]['snippets_revision']
 707         except (IndexError, KeyError):
 708             return None
 709
 710
 711 class Search(SolrIndex):
 712     """
 713     Search facilities.
 714     """
    def __init__(self, default_field="text"):
        """Set up the Solr interface with mode='r'.

        NOTE(review): default_field is accepted but not used here.
        NOTE(review): 'r' presumably means read-only access; confirm in
        custom.CustomSolrInterface.
        """
        super(Search, self).__init__(mode='r')
 717
 718     def make_term_query(self, query, field='text', modal=operator.or_):
 719         """
 720         Returns term queries joined by boolean query.
 721         modal - applies to boolean query
 722         fuzzy - should the query by fuzzy.
 723         """
 724         if query is None:
 725             query = ''
 726         q = self.index.Q()
 727         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 728
 729         return q
 730
 731     def search_words(self, words, fields, book=True):
 732         filters = []
 733         for word in words:
 734             word_filter = None
 735             for field in fields:
 736                 q = self.index.Q(**{field: word})
 737                 if word_filter is None:
 738                     word_filter = q
 739                 else:
 740                     word_filter |= q
 741             filters.append(word_filter)
 742         if book:
 743             query = self.index.query(is_book=True)
 744         else:
 745             query = self.index.query()
 746         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 747         return [SearchResult(found, how_found='search_words', query_terms=words) for found in query.execute()]
 748
 749     def get_snippets(self, searchresult, query, field='text', num=1):
 750         """
 751         Returns a snippet for found scoreDoc.
 752         """
 753         maxnum = len(searchresult)
 754         if num is None or num < 0 or num > maxnum:
 755             num = maxnum
 756         book_id = searchresult.book_id
 757         revision = searchresult.snippet_revision()
 758         snippets = Snippets(book_id, revision=revision)
 759         snips = [None] * maxnum
 760         try:
 761             snippets.open()
 762             idx = 0
 763             while idx < maxnum and num > 0:
 764                 position, length = searchresult.snippet_pos(idx)
 765                 if position is None or length is None:
 766                     continue
 767                 text = snippets.get((int(position),
 768                                      int(length)))
 769                 snip = self.index.highlight(text=text, field=field, q=query)
 770                 if snip not in snips:
 771                     snips[idx] = snip
 772                     if snip:
 773                         num -= 1
 774                 idx += 1
 775
 776         except IOError, e:
 777             book = catalogue.models.Book.objects.filter(id=book_id)
 778             if not book:
 779                 log.error("Book does not exist for book id = %d" % book_id)
 780             elif not book.get().children.exists():
 781                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 782             return []
 783         finally:
 784             snippets.close()
 785
 786             # remove verse end markers..
 787         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 788
 789         searchresult.snippets = snips
 790
 791         return snips
 792
 793     def hint_tags(self, query, pdcounter=True, prefix=True):
 794         """
 795         Return auto-complete hints for tags
 796         using prefix search.
 797         """
 798         q = self.index.Q()
 799         query = query.strip()
 800         for field in ['tag_name', 'tag_name_pl']:
 801             if prefix:
 802                 q |= self.index.Q(**{field: query + "*"})
 803             else:
 804                 q |= self.make_term_query(query, field=field)
 805         qu = self.index.query(q)
 806
 807         return self.search_tags(qu, pdcounter=pdcounter)
 808
 809     def search_tags(self, query, filters=None, pdcounter=False):
 810         """
 811         Search for Tag objects using query.
 812         """
 813         if not filters:
 814             filters = []
 815         if not pdcounter:
 816             filters.append(~self.index.Q(is_pdcounter=True))
 817         res = self.apply_filters(query, filters).execute()
 818
 819         tags = []
 820         pd_tags = []
 821
 822         for doc in res:
 823             is_pdcounter = doc.get('is_pdcounter', False)
 824             category = doc.get('tag_category')
 825             try:
 826                 if is_pdcounter:
 827                     if category == 'pd_author':
 828                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 829                     else:  # category == 'pd_book':
 830                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 831                         tag.category = 'pd_book'  # make it look more lik a tag.
 832                     pd_tags.append(tag)
 833                 else:
 834                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 835                     tags.append(tag)
 836
 837             except catalogue.models.Tag.DoesNotExist:
 838                 pass
 839             except PDCounterAuthor.DoesNotExist:
 840                 pass
 841             except PDCounterBook.DoesNotExist:
 842                 pass
 843
 844         tags_slugs = set(map(lambda t: t.slug, tags))
 845         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
 846
 847         log.debug('search_tags: %s' % tags)
 848
 849         return tags
 850
 851     def hint_books(self, query, prefix=True):
 852         """
 853         Returns auto-complete hints for book titles
 854         Because we do not index 'pseudo' title-tags.
 855         Prefix search.
 856         """
 857         q = self.index.Q()
 858         query = query.strip()
 859         if prefix:
 860             q |= self.index.Q(title=query + "*")
 861             q |= self.index.Q(title_orig=query + "*")
 862         else:
 863             q |= self.make_term_query(query, field='title')
 864             q |= self.make_term_query(query, field='title_orig')
 865         qu = self.index.query(q)
 866         only_books = self.index.Q(is_book=True)
 867         return self.search_books(qu, [only_books])
 868
 869     def search_books(self, query, filters=None, max_results=10):
 870         """
 871         Searches for Book objects using query
 872         """
 873         bks = []
 874         bks_found = set()
 875         query = query.query(is_book=True)
 876         res = self.apply_filters(query, filters).field_limit(['book_id'])
 877         for r in res:
 878             try:
 879                 bid = r['book_id']
 880                 if bid not in bks_found:
 881                     bks.append(catalogue.models.Book.objects.get(id=bid))
 882                     bks_found.add(bid)
 883             except catalogue.models.Book.DoesNotExist:
 884                 pass
 885         return bks
 886
 887     @staticmethod
 888     def apply_filters(query, filters):
 889         """
 890         Apply filters to a query
 891         """
 892         if filters is None:
 893             filters = []
 894         filters = filter(lambda x: x is not None, filters)
 895         for f in filters:
 896             query = query.query(f)
 897         return query
 898
 899
# When settings.SEARCH_MOCK is set to a true value, the Search class defined
# above is replaced by the one imported from .mock_search.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search