src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from itertools import chain
  15 import sunburnt
  16 import custom
  17 import operator
  18 import logging
  19 from wolnelektury.utils import makedirs
  20
  21 log = logging.getLogger('search')
  22
  23
  24 class SolrIndex(object):
  25     def __init__(self, mode=None):
  26         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  27
  28
  29 class Snippets(object):
  30     """
  31     This class manages snippet files for indexed object (book)
  32     the snippets are concatenated together, and their positions and
  33     lengths are kept in lucene index fields.
  34     """
  35     SNIPPET_DIR = "snippets"
  36
  37     def __init__(self, book_id, revision=None):
  38         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         self.book_id = book_id
  40         self.revision = revision
  41         self.file = None
  42         self.position = None
  43
  44     @property
  45     def path(self):
  46         if self.revision:
  47             fn = "%d.%d" % (self.book_id, self.revision)
  48         else:
  49             fn = "%d" % self.book_id
  50
  51         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  52
  53     def open(self, mode='r'):
  54         """
  55         Open the snippet file. Call .close() afterwards.
  56         """
  57         if 'b' not in mode:
  58             mode += 'b'
  59
  60         if 'w' in mode:
  61             if os.path.exists(self.path):
  62                 self.revision = 1
  63                 while True:
  64                     if not os.path.exists(self.path):
  65                         break
  66                     self.revision += 1
  67
  68         self.file = open(self.path, mode)
  69         self.position = 0
  70         return self
  71
  72     def add(self, snippet):
  73         """
  74         Append a snippet (unicode) to the snippet file.
  75         Return a (position, length) tuple
  76         """
  77         txt = snippet.encode('utf-8')
  78         l = len(txt)
  79         self.file.write(txt)
  80         pos = (self.position, l)
  81         self.position += l
  82         return pos
  83
  84     def get(self, pos):
  85         """
  86         Given a tuple of (position, length) return an unicode
  87         of the snippet stored there.
  88         """
  89         self.file.seek(pos[0], 0)
  90         txt = self.file.read(pos[1]).decode('utf-8')
  91         return txt
  92
  93     def close(self):
  94         """Close snippet file"""
  95         if self.file:
  96             self.file.close()
  97
  98     def remove(self):
  99         self.revision = None
 100         try:
 101             os.unlink(self.path)
 102             self.revision = 0
 103             while True:
 104                 self.revision += 1
 105                 os.unlink(self.path)
 106         except OSError:
 107             pass
 108
 109
 110 class Index(SolrIndex):
 111     """
 112     Class indexing books.
 113     """
 114     def __init__(self):
 115         super(Index, self).__init__(mode='rw')
 116
 117     def delete_query(self, *queries):
 118         """
 119         index.delete(queries=...) doesn't work, so let's reimplement it
 120         using deletion of list of uids.
 121         """
 122         uids = set()
 123         for q in queries:
 124             if isinstance(q, sunburnt.search.LuceneQuery):
 125                 q = self.index.query(q)
 126             q.field_limiter.update(['uid'])
 127             st = 0
 128             rows = 100
 129             while True:
 130                 ids = q.paginate(start=st, rows=rows).execute()
 131                 if not len(ids):
 132                     break
 133                 for res in ids:
 134                     uids.add(res['uid'])
 135                 st += rows
 136         if uids:
 137             self.index.delete(uids)
 138             return True
 139         else:
 140             return False
 141
 142     # WTF
 143     def index_tags(self, *tags, **kw):
 144         """
 145         Re-index global tag list.
 146         Removes all tags from index, then index them again.
 147         Indexed fields include: id, name (with and without polish stems), category
 148         """
 149         log.debug("Indexing tags")
 150         remove_only = kw.get('remove_only', False)
 151         # first, remove tags from index.
 152         if tags:
 153             tag_qs = []
 154             for tag in tags:
 155                 q_id = self.index.Q(tag_id=tag.id)
 156
 157                 if isinstance(tag, PDCounterAuthor):
 158                     q_cat = self.index.Q(tag_category='pd_author')
 159                 elif isinstance(tag, PDCounterBook):
 160                     q_cat = self.index.Q(tag_category='pd_book')
 161                 else:
 162                     q_cat = self.index.Q(tag_category=tag.category)
 163
 164                 q_id_cat = self.index.Q(q_id & q_cat)
 165                 tag_qs.append(q_id_cat)
 166             self.delete_query(*tag_qs)
 167         else:  # all
 168             q = self.index.Q(tag_id__any=True)
 169             self.delete_query(q)
 170
 171         if not remove_only:
 172             # then add them [all or just one passed]
 173             if not tags:
 174                 tags = chain(
 175                     catalogue.models.Tag.objects.exclude(category='set'),
 176                     PDCounterAuthor.objects.all(),
 177                     PDCounterBook.objects.all())
 178
 179             for tag in tags:
 180                 if isinstance(tag, PDCounterAuthor):
 181                     doc = {
 182                         "tag_id": int(tag.id),
 183                         "tag_name": tag.name,
 184                         "tag_name_pl": tag.name,
 185                         "tag_category": 'pd_author',
 186                         "is_pdcounter": True,
 187                         "uid": "tag%d_pd_a" % tag.id
 188                         }
 189                 elif isinstance(tag, PDCounterBook):
 190                     doc = {
 191                         "tag_id": int(tag.id),
 192                         "tag_name": tag.title,
 193                         "tag_name_pl": tag.title,
 194                         "tag_category": 'pd_book',
 195                         "is_pdcounter": True,
 196                         "uid": "tag%d_pd_b" % tag.id
 197                         }
 198                 else:
 199                     doc = {
 200                         "tag_id": int(tag.id),
 201                         "tag_name": tag.name,
 202                         "tag_name_pl": tag.name,
 203                         "tag_category": tag.category,
 204                         "is_pdcounter": False,
 205                         "uid": "tag%d" % tag.id
 206                         }
 207                 self.index.add(doc)
 208
 209     def create_book_doc(self, book):
 210         """
 211         Create a lucene document referring book id.
 212         """
 213         doc = {'book_id': int(book.id)}
 214         if book.parent is not None:
 215             doc['parent_id'] = int(book.parent.id)
 216         return doc
 217
 218     def remove_book(self, book_or_id, remove_snippets=True):
 219         """Removes a book from search index.
 220         book - Book instance."""
 221         if isinstance(book_or_id, catalogue.models.Book):
 222             book_id = book_or_id.id
 223         else:
 224             book_id = book_or_id
 225
 226         self.delete_query(self.index.Q(book_id=book_id))
 227
 228         if remove_snippets:
 229             snippets = Snippets(book_id)
 230             snippets.remove()
 231
 232     def index_book(self, book, book_info=None, overwrite=True):
 233         """
 234         Indexes the book.
 235         Creates a lucene document for extracted metadata
 236         and calls self.index_content() to index the contents of the book.
 237         """
 238         if overwrite:
 239             # we don't remove snippets, since they might be still needed by
 240             # threads using not reopened index
 241             self.remove_book(book, remove_snippets=False)
 242
 243         book_doc = self.create_book_doc(book)
 244         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 245         # let's not index it - it's only used for extracting publish date
 246         if 'source_name' in meta_fields:
 247             del meta_fields['source_name']
 248
 249         for n, f in meta_fields.items():
 250             book_doc[n] = f
 251
 252         book_doc['uid'] = "book%s" % book_doc['book_id']
 253         self.index.add(book_doc)
 254         del book_doc
 255         book_fields = {
 256             'title': meta_fields['title'],
 257             'authors': meta_fields['authors'],
 258             'published_date': meta_fields['published_date']
 259             }
 260
 261         if 'translators' in meta_fields:
 262             book_fields['translators'] = meta_fields['translators']
 263
 264         self.index_content(book, book_fields=book_fields)
 265
 266     master_tags = [
 267         'opowiadanie',
 268         'powiesc',
 269         'dramat_wierszowany_l',
 270         'dramat_wierszowany_lp',
 271         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 272         'wywiad',
 273         ]
 274
 275     ignore_content_tags = [
 276         'uwaga', 'extra',
 277         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 278         'didaskalia',
 279         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 280         ]
 281
 282     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 283
 284     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 285                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 286
 287     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 288
 289     def extract_metadata(self, book, book_info=None, dc_only=None):
 290         """
 291         Extract metadata from book and returns a map of fields keyed by fieldname
 292         """
 293         fields = {}
 294
 295         if book_info is None:
 296             book_info = dcparser.parse(open(book.xml_file.path))
 297
 298         fields['slug'] = book.slug
 299         fields['tags'] = [t.name for t in book.tags]
 300         fields['is_book'] = True
 301
 302         # validator, name
 303         for field in dcparser.BookInfo.FIELDS:
 304             if dc_only and field.name not in dc_only:
 305                 continue
 306             if hasattr(book_info, field.name):
 307                 if not getattr(book_info, field.name):
 308                     continue
 309                 # since no type information is available, we use validator
 310                 type_indicator = field.validator
 311                 if type_indicator == dcparser.as_unicode:
 312                     s = getattr(book_info, field.name)
 313                     if field.multiple:
 314                         s = ', '.join(s)
 315                     fields[field.name] = s
 316                 elif type_indicator == dcparser.as_person:
 317                     p = getattr(book_info, field.name)
 318                     if isinstance(p, dcparser.Person):
 319                         persons = unicode(p)
 320                     else:
 321                         persons = ', '.join(map(unicode, p))
 322                     fields[field.name] = persons
 323                 elif type_indicator == dcparser.as_date:
 324                     dt = getattr(book_info, field.name)
 325                     fields[field.name] = dt
 326
 327         # get published date
 328         pd = None
 329         if hasattr(book_info, 'source_name') and book_info.source_name:
 330             match = self.published_date_re.search(book_info.source_name)
 331             if match is not None:
 332                 pd = str(match.groups()[0])
 333         if not pd:
 334             pd = ""
 335         fields["published_date"] = pd
 336
 337         return fields
 338
 339     # def add_gaps(self, fields, fieldname):
 340     #     """
 341     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 342     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 343     #     """
 344     #     def gap():
 345     #         while True:
 346     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 347     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 348
 349     def get_master(self, root):
 350         """
 351         Returns the first master tag from an etree.
 352         """
 353         for master in root.iter():
 354             if master.tag in self.master_tags:
 355                 return master
 356
 357     def index_content(self, book, book_fields):
 358         """
 359         Walks the book XML and extract content from it.
 360         Adds parts for each header tag and for each fragment.
 361         """
 362         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 363         root = wld.edoc.getroot()
 364
 365         master = self.get_master(root)
 366         if master is None:
 367             return []
 368
 369         def walker(node, ignore_tags=()):
 370             if node.tag not in ignore_tags:
 371                 yield node, None, None
 372                 if node.text is not None:
 373                     yield None, node.text, None
 374                 for child in list(node):
 375                     for b, t, e in walker(child):
 376                         yield b, t, e
 377                 yield None, None, node
 378
 379             if node.tail is not None:
 380                 yield None, node.tail, None
 381             return
 382
 383         def fix_format(text):
 384             # separator = [u" ", u"\t", u".", u";", u","]
 385             if isinstance(text, list):
 386                 # need to join it first
 387                 text = filter(lambda s: s is not None, content)
 388                 text = u' '.join(text)
 389                 # for i in range(len(text)):
 390                 #     if i > 0:
 391                 #         if text[i][0] not in separator\
 392                 #             and text[i - 1][-1] not in separator:
 393                 #          text.insert(i, u" ")
 394
 395             return re.sub("(?m)/$", "", text)
 396
 397         def add_part(snippets, **fields):
 398             doc = self.create_book_doc(book)
 399             for n, v in book_fields.items():
 400                 doc[n] = v
 401
 402             doc['header_index'] = fields["header_index"]
 403             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 404             doc['header_type'] = fields['header_type']
 405
 406             doc['text'] = fields['text']
 407
 408             # snippets
 409             snip_pos = snippets.add(fields["text"])
 410
 411             doc['snippets_position'] = snip_pos[0]
 412             doc['snippets_length'] = snip_pos[1]
 413             if snippets.revision:
 414                 doc["snippets_revision"] = snippets.revision
 415
 416             if 'fragment_anchor' in fields:
 417                 doc["fragment_anchor"] = fields['fragment_anchor']
 418
 419             if 'themes' in fields:
 420                 doc['themes'] = fields['themes']
 421             doc['uid'] = "part%s%s%s" % (doc['header_index'],
 422                                          doc['header_span'],
 423                                          doc.get('fragment_anchor', ''))
 424             return doc
 425
 426         def give_me_utf8(s):
 427             if isinstance(s, unicode):
 428                 return s.encode('utf-8')
 429             else:
 430                 return s
 431
 432         fragments = {}
 433         snippets = Snippets(book.id).open('w')
 434         try:
 435             for header, position in zip(list(master), range(len(master))):
 436
 437                 if header.tag in self.skip_header_tags:
 438                     continue
 439                 if header.tag is etree.Comment:
 440                     continue
 441
 442                 # section content
 443                 content = []
 444                 footnote = []
 445
 446                 def all_content(text):
 447                     for frag in fragments.values():
 448                         frag['text'].append(text)
 449                     content.append(text)
 450                 handle_text = [all_content]
 451
 452                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 453                     # handle footnotes
 454                     if start is not None and start.tag in self.footnote_tags:
 455                         footnote = []
 456
 457                         def collect_footnote(t):
 458                             footnote.append(t)
 459
 460                         handle_text.append(collect_footnote)
 461                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 462                         handle_text.pop()
 463                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 464                                        text=u''.join(footnote),
 465                                        is_footnote=True)
 466                         self.index.add(doc)
 467                         footnote = []
 468
 469                     # handle fragments and themes.
 470                     if start is not None and start.tag == 'begin':
 471                         fid = start.attrib['id'][1:]
 472                         fragments[fid] = {
 473                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 474
 475                     # themes for this fragment
 476                     elif start is not None and start.tag == 'motyw':
 477                         fid = start.attrib['id'][1:]
 478                         handle_text.append(lambda text: None)
 479                         if start.text is not None:
 480                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 481                     elif end is not None and end.tag == 'motyw':
 482                         handle_text.pop()
 483
 484                     elif start is not None and start.tag == 'end':
 485                         fid = start.attrib['id'][1:]
 486                         if fid not in fragments:
 487                             continue  # a broken <end> node, skip it
 488                         frag = fragments[fid]
 489                         if not frag['themes']:
 490                             continue  # empty themes list.
 491                         del fragments[fid]
 492
 493                         doc = add_part(snippets,
 494                                        header_type=frag['start_header'],
 495                                        header_index=frag['start_section'],
 496                                        header_span=position - frag['start_section'] + 1,
 497                                        fragment_anchor=fid,
 498                                        text=fix_format(frag['text']),
 499                                        themes=frag['themes'])
 500                         self.index.add(doc)
 501
 502                         # Collect content.
 503
 504                     if text is not None and handle_text is not []:
 505                         hdl = handle_text[-1]
 506                         hdl(text)
 507
 508                         # in the end, add a section text.
 509                 doc = add_part(snippets, header_index=position,
 510                                header_type=header.tag, text=fix_format(content))
 511
 512                 self.index.add(doc)
 513
 514         finally:
 515             snippets.close()
 516
 517
 518 class SearchResult(object):
 519     def __init__(self, doc, how_found=None, query=None, query_terms=None):
 520         #        self.search = search
 521         self.boost = 1.0
 522         self._hits = []
 523         self._processed_hits = None  # processed hits
 524         self.snippets = []
 525         self.query_terms = query_terms
 526         self._book = None
 527
 528         if 'score' in doc:
 529             self._score = doc['score']
 530         else:
 531             self._score = 0
 532
 533         self.book_id = int(doc["book_id"])
 534
 535         try:
 536             self.published_date = int(doc.get("published_date"))
 537         except ValueError:
 538             self.published_date = 0
 539
 540         # content hits
 541         header_type = doc.get("header_type", None)
 542         # we have a content hit in some header of fragment
 543         if header_type is not None:
 544             sec = (header_type, int(doc["header_index"]))
 545             header_span = doc['header_span']
 546             header_span = header_span is not None and int(header_span) or 1
 547             fragment = doc.get("fragment_anchor", None)
 548             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 549             snippets_rev = doc.get('snippets_revision', None)
 550
 551             hit = (sec + (header_span,), fragment, self._score, {
 552                 'how_found': how_found,
 553                 'snippets_pos': snippets_pos,
 554                 'snippets_revision': snippets_rev,
 555                 'themes': doc.get('themes', []),
 556                 'themes_pl': doc.get('themes_pl', [])
 557                 })
 558
 559             self._hits.append(hit)
 560
 561     def __unicode__(self):
 562         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 563             (self.book_id, len(self._hits),
 564              len(self._processed_hits) if self._processed_hits else -1,
 565              self._score, len(self.snippets))
 566
 567     def __str__(self):
 568         return unicode(self).encode('utf-8')
 569
 570     @property
 571     def score(self):
 572         return self._score * self.boost
 573
 574     def merge(self, other):
 575         if self.book_id != other.book_id:
 576             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 577         self._hits += other._hits
 578         if other.score > self.score:
 579             self._score = other._score
 580         return self
 581
 582     def get_book(self):
 583         if self._book is not None:
 584             return self._book
 585         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 586         return self._book
 587
 588     book = property(get_book)
 589
 590     POSITION = 0
 591     FRAGMENT = 1
 592     POSITION_INDEX = 1
 593     POSITION_SPAN = 2
 594     SCORE = 2
 595     OTHER = 3
 596
 597     @property
 598     def hits(self):
 599         if self._processed_hits is not None:
 600             return self._processed_hits
 601
 602         # to sections and fragments
 603         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 604
 605         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 606
 607         # sections not covered by fragments
 608         sect = filter(lambda s: 0 == len(filter(
 609             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 610                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 611
 612         def remove_duplicates(lst, keyfn, compare):
 613             els = {}
 614             for e in lst:
 615                 eif = keyfn(e)
 616                 if eif in els:
 617                     if compare(els[eif], e) >= 1:
 618                         continue
 619                 els[eif] = e
 620             return els.values()
 621
 622         # remove fragments with duplicated fid's and duplicated snippets
 623         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 624         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 625         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 626
 627         # remove duplicate sections
 628         sections = {}
 629
 630         for s in sect:
 631             si = s[self.POSITION][self.POSITION_INDEX]
 632             # skip existing
 633             if si in sections:
 634                 if sections[si]['score'] >= s[self.SCORE]:
 635                     continue
 636
 637             m = {'score': s[self.SCORE],
 638                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 639                  }
 640             m.update(s[self.OTHER])
 641             sections[si] = m
 642
 643         hits = sections.values()
 644
 645         for f in frags:
 646             try:
 647                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 648             except catalogue.models.Fragment.DoesNotExist:
 649                 # stale index
 650                 continue
 651             # Figure out if we were searching for a token matching some word in theme name.
 652             themes = frag.tags.filter(category='theme')
 653             themes_hit = set()
 654             if self.query_terms is not None:
 655                 for i in range(0, len(f[self.OTHER]['themes'])):
 656                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 657                     tms = map(unicode.lower, tms)
 658                     for qt in self.query_terms:
 659                         if qt in tms:
 660                             themes_hit.add(f[self.OTHER]['themes'][i])
 661                             break
 662
 663             def theme_by_name(n):
 664                 th = filter(lambda t: t.name == n, themes)
 665                 if th:
 666                     return th[0]
 667                 else:
 668                     return None
 669             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 670
 671             m = {'score': f[self.SCORE],
 672                  'fragment': frag,
 673                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 674                  'themes': themes,
 675                  'themes_hit': themes_hit
 676                  }
 677             m.update(f[self.OTHER])
 678             hits.append(m)
 679
 680         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 681
 682         self._processed_hits = hits
 683
 684         return hits
 685
 686     @staticmethod
 687     def aggregate(*result_lists):
 688         books = {}
 689         for rl in result_lists:
 690             for r in rl:
 691                 if r.book_id in books:
 692                     books[r.book_id].merge(r)
 693                 else:
 694                     books[r.book_id] = r
 695         return books.values()
 696
 697     def __cmp__(self, other):
 698         c = cmp(self.score, other.score)
 699         if c == 0:
 700             # this is inverted, because earlier date is better
 701             return cmp(other.published_date, self.published_date)
 702         else:
 703             return c
 704
 705     def __len__(self):
 706         return len(self.hits)
 707
 708     def snippet_pos(self, idx=0):
 709         return self.hits[idx]['snippets_pos']
 710
 711     def snippet_revision(self, idx=0):
 712         try:
 713             return self.hits[idx]['snippets_revision']
 714         except (IndexError, KeyError):
 715             return None
 716
 717
 718 class Search(SolrIndex):
 719     """
 720     Search facilities.
 721     """
 722     def __init__(self, default_field="text"):
 723         super(Search, self).__init__(mode='r')
 724
 725     def make_term_query(self, query, field='text', modal=operator.or_):
 726         """
 727         Returns term queries joined by boolean query.
 728         modal - applies to boolean query
 729         fuzzy - should the query by fuzzy.
 730         """
 731         if query is None:
 732             query = ''
 733         q = self.index.Q()
 734         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 735
 736         return q
 737
 738     def search_phrase(self, searched, field='text', book=False,
 739                       filters=None,
 740                       snippets=False):
 741         if filters is None:
 742             filters = []
 743         if book:
 744             filters.append(self.index.Q(is_book=True))
 745
 746         q = self.index.query(**{field: searched})
 747         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 748         res = q.execute()
 749         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 750
 751     def search_some(self, searched, fields, book=True,
 752                     filters=None, snippets=True, query_terms=None):
 753         assert isinstance(fields, list)
 754         if filters is None:
 755             filters = []
 756         if book:
 757             filters.append(self.index.Q(is_book=True))
 758
 759         query = self.index.Q()
 760
 761         for fld in fields:
 762             query = self.index.Q(query | self.make_term_query(searched, fld))
 763
 764         query = self.index.query(query)
 765         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 766         res = query.execute()
 767         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 768
 769     def search_everywhere(self, searched, query_terms=None):
 770         """
 771         Tries to use search terms to match different fields of book (or its parts).
 772         E.g. one word can be an author survey, another be a part of the title, and the rest
 773         are some words from third chapter.
 774         """
 775         books = []
 776         # content only query : themes x content
 777         q = self.make_term_query(searched, 'text')
 778         q_themes = self.make_term_query(searched, 'themes_pl')
 779
 780         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 781         res = query.execute()
 782
 783         for found in res:
 784             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 785
 786         # query themes/content x author/title/tags
 787         in_content = self.index.Q()
 788         in_meta = self.index.Q()
 789
 790         for fld in ['themes_pl', 'text']:
 791             in_content |= self.make_term_query(searched, field=fld)
 792
 793         for fld in ['tags', 'authors', 'title']:
 794             in_meta |= self.make_term_query(searched, field=fld)
 795
 796         q = in_content & in_meta
 797         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 798
 799         for found in res:
 800             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 801
 802         return books
 803
 804     def get_snippets(self, searchresult, query, field='text', num=1):
 805         """
 806         Returns a snippet for found scoreDoc.
 807         """
 808         maxnum = len(searchresult)
 809         if num is None or num < 0 or num > maxnum:
 810             num = maxnum
 811         book_id = searchresult.book_id
 812         revision = searchresult.snippet_revision()
 813         snippets = Snippets(book_id, revision=revision)
 814         snips = [None] * maxnum
 815         try:
 816             snippets.open()
 817             idx = 0
 818             while idx < maxnum and num > 0:
 819                 position, length = searchresult.snippet_pos(idx)
 820                 if position is None or length is None:
 821                     continue
 822                 text = snippets.get((int(position),
 823                                      int(length)))
 824                 snip = self.index.highlight(text=text, field=field, q=query)
 825                 snips[idx] = snip
 826                 if snip:
 827                     num -= 1
 828                 idx += 1
 829
 830         except IOError, e:
 831             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 832             return []
 833         finally:
 834             snippets.close()
 835
 836             # remove verse end markers..
 837         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 838
 839         searchresult.snippets = snips
 840
 841         return snips
 842
 843     def hint_tags(self, query, pdcounter=True, prefix=True):
 844         """
 845         Return auto-complete hints for tags
 846         using prefix search.
 847         """
 848         q = self.index.Q()
 849         query = query.strip()
 850         for field in ['tag_name', 'tag_name_pl']:
 851             if prefix:
 852                 q |= self.index.Q(**{field: query + "*"})
 853             else:
 854                 q |= self.make_term_query(query, field=field)
 855         qu = self.index.query(q)
 856
 857         return self.search_tags(qu, pdcounter=pdcounter)
 858
 859     def search_tags(self, query, filters=None, pdcounter=False):
 860         """
 861         Search for Tag objects using query.
 862         """
 863         if not filters:
 864             filters = []
 865         if not pdcounter:
 866             filters.append(~self.index.Q(is_pdcounter=True))
 867         res = self.apply_filters(query, filters).execute()
 868
 869         tags = []
 870         pd_tags = []
 871
 872         for doc in res:
 873             is_pdcounter = doc.get('is_pdcounter', False)
 874             category = doc.get('tag_category')
 875             try:
 876                 if is_pdcounter:
 877                     if category == 'pd_author':
 878                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 879                     elif category == 'pd_book':
 880                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 881                         tag.category = 'pd_book'  # make it look more lik a tag.
 882                     else:
 883                         # WTF
 884                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
 885                             int(doc.get('tag_id')), category)).encode('utf-8')
 886                     pd_tags.append(tag)
 887                 else:
 888                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 889                     tags.append(tag)
 890
 891             except catalogue.models.Tag.DoesNotExist:
 892                 pass
 893             except PDCounterAuthor.DoesNotExist:
 894                 pass
 895             except PDCounterBook.DoesNotExist:
 896                 pass
 897
 898         tags_slugs = set(map(lambda t: t.slug, tags))
 899         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
 900
 901         log.debug('search_tags: %s' % tags)
 902
 903         return tags
 904
 905     def hint_books(self, query, prefix=True):
 906         """
 907         Returns auto-complete hints for book titles
 908         Because we do not index 'pseudo' title-tags.
 909         Prefix search.
 910         """
 911         q = self.index.Q()
 912         query = query.strip()
 913         if prefix:
 914             q |= self.index.Q(title=query + "*")
 915         else:
 916             q |= self.make_term_query(query, field='title')
 917         qu = self.index.query(q)
 918         only_books = self.index.Q(is_book=True)
 919         return self.search_books(qu, [only_books])
 920
 921     def search_books(self, query, filters=None, max_results=10):
 922         """
 923         Searches for Book objects using query
 924         """
 925         bks = []
 926         bks_found = set()
 927         query = query.query(is_book=True)
 928         res = self.apply_filters(query, filters).field_limit(['book_id'])
 929         for r in res:
 930             try:
 931                 bid = r['book_id']
 932                 if bid not in bks_found:
 933                     bks.append(catalogue.models.Book.objects.get(id=bid))
 934                     bks_found.add(bid)
 935             except catalogue.models.Book.DoesNotExist:
 936                 pass
 937         return bks
 938
 939     @staticmethod
 940     def apply_filters(query, filters):
 941         """
 942         Apply filters to a query
 943         """
 944         if filters is None:
 945             filters = []
 946         filters = filter(lambda x: x is not None, filters)
 947         for f in filters:
 948             query = query.query(f)
 949         return query
 950
 951
 952 if getattr(settings, 'SEARCH_MOCK', False):
 953     from .mock_search import Search