src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from itertools import chain
  15 import sunburnt
  16 import custom
  17 import operator
  18 import logging
  19 from wolnelektury.utils import makedirs
  20
  21 log = logging.getLogger('search')
  22
  23
  24 class SolrIndex(object):
  25     def __init__(self, mode=None):
  26         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  27
  28
  29 class Snippets(object):
  30     """
  31     This class manages snippet files for indexed object (book)
  32     the snippets are concatenated together, and their positions and
  33     lengths are kept in lucene index fields.
  34     """
  35     SNIPPET_DIR = "snippets"
  36
  37     def __init__(self, book_id, revision=None):
  38         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         self.book_id = book_id
  40         self.revision = revision
  41         self.file = None
  42         self.position = None
  43
  44     @property
  45     def path(self):
  46         if self.revision:
  47             fn = "%d.%d" % (self.book_id, self.revision)
  48         else:
  49             fn = "%d" % self.book_id
  50
  51         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  52
  53     def open(self, mode='r'):
  54         """
  55         Open the snippet file. Call .close() afterwards.
  56         """
  57         if 'b' not in mode:
  58             mode += 'b'
  59
  60         if 'w' in mode:
  61             if os.path.exists(self.path):
  62                 self.revision = 1
  63                 while True:
  64                     if not os.path.exists(self.path):
  65                         break
  66                     self.revision += 1
  67
  68         self.file = open(self.path, mode)
  69         self.position = 0
  70         return self
  71
  72     def add(self, snippet):
  73         """
  74         Append a snippet (unicode) to the snippet file.
  75         Return a (position, length) tuple
  76         """
  77         txt = snippet.encode('utf-8')
  78         l = len(txt)
  79         self.file.write(txt)
  80         pos = (self.position, l)
  81         self.position += l
  82         return pos
  83
  84     def get(self, pos):
  85         """
  86         Given a tuple of (position, length) return an unicode
  87         of the snippet stored there.
  88         """
  89         self.file.seek(pos[0], 0)
  90         txt = self.file.read(pos[1]).decode('utf-8')
  91         return txt
  92
  93     def close(self):
  94         """Close snippet file"""
  95         if self.file:
  96             self.file.close()
  97
  98     def remove(self):
  99         self.revision = None
 100         try:
 101             os.unlink(self.path)
 102             self.revision = 0
 103             while True:
 104                 self.revision += 1
 105                 os.unlink(self.path)
 106         except OSError:
 107             pass
 108
 109
 110 class Index(SolrIndex):
 111     """
 112     Class indexing books.
 113     """
 114     def __init__(self):
 115         super(Index, self).__init__(mode='rw')
 116
 117     def delete_query(self, *queries):
 118         """
 119         index.delete(queries=...) doesn't work, so let's reimplement it
 120         using deletion of list of uids.
 121         """
 122         uids = set()
 123         for q in queries:
 124             if isinstance(q, sunburnt.search.LuceneQuery):
 125                 q = self.index.query(q)
 126             q.field_limiter.update(['uid'])
 127             st = 0
 128             rows = 100
 129             while True:
 130                 ids = q.paginate(start=st, rows=rows).execute()
 131                 if not len(ids):
 132                     break
 133                 for res in ids:
 134                     uids.add(res['uid'])
 135                 st += rows
 136         if uids:
 137             self.index.delete(uids)
 138             return True
 139         else:
 140             return False
 141
 142     def index_tags(self, *tags, **kw):
 143         """
 144         Re-index global tag list.
 145         Removes all tags from index, then index them again.
 146         Indexed fields include: id, name (with and without polish stems), category
 147         """
 148         log.debug("Indexing tags")
 149         remove_only = kw.get('remove_only', False)
 150         # first, remove tags from index.
 151         if tags:
 152             tag_qs = []
 153             for tag in tags:
 154                 q_id = self.index.Q(tag_id=tag.id)
 155
 156                 if isinstance(tag, PDCounterAuthor):
 157                     q_cat = self.index.Q(tag_category='pd_author')
 158                 elif isinstance(tag, PDCounterBook):
 159                     q_cat = self.index.Q(tag_category='pd_book')
 160                 else:
 161                     q_cat = self.index.Q(tag_category=tag.category)
 162
 163                 q_id_cat = self.index.Q(q_id & q_cat)
 164                 tag_qs.append(q_id_cat)
 165             self.delete_query(*tag_qs)
 166         else:  # all
 167             q = self.index.Q(tag_id__any=True)
 168             self.delete_query(q)
 169
 170         if not remove_only:
 171             # then add them [all or just one passed]
 172             if not tags:
 173                 tags = chain(
 174                     catalogue.models.Tag.objects.exclude(category='set'),
 175                     PDCounterAuthor.objects.all(),
 176                     PDCounterBook.objects.all())
 177
 178             for tag in tags:
 179                 if isinstance(tag, PDCounterAuthor):
 180                     doc = {
 181                         "tag_id": int(tag.id),
 182                         "tag_name": tag.name,
 183                         "tag_name_pl": tag.name,
 184                         "tag_category": 'pd_author',
 185                         "is_pdcounter": True,
 186                         "uid": "tag%d_pd_a" % tag.id
 187                         }
 188                 elif isinstance(tag, PDCounterBook):
 189                     doc = {
 190                         "tag_id": int(tag.id),
 191                         "tag_name": tag.title,
 192                         "tag_name_pl": tag.title,
 193                         "tag_category": 'pd_book',
 194                         "is_pdcounter": True,
 195                         "uid": "tag%d_pd_b" % tag.id
 196                         }
 197                 else:
 198                     doc = {
 199                         "tag_id": int(tag.id),
 200                         "tag_name": tag.name,
 201                         "tag_name_pl": tag.name,
 202                         "tag_category": tag.category,
 203                         "is_pdcounter": False,
 204                         "uid": "tag%d" % tag.id
 205                         }
 206                 self.index.add(doc)
 207
 208     def create_book_doc(self, book):
 209         """
 210         Create a lucene document referring book id.
 211         """
 212         doc = {'book_id': int(book.id)}
 213         if book.parent is not None:
 214             doc['parent_id'] = int(book.parent.id)
 215         return doc
 216
 217     def remove_book(self, book_or_id, remove_snippets=True):
 218         """Removes a book from search index.
 219         book - Book instance."""
 220         if isinstance(book_or_id, catalogue.models.Book):
 221             book_id = book_or_id.id
 222         else:
 223             book_id = book_or_id
 224
 225         self.delete_query(self.index.Q(book_id=book_id))
 226
 227         if remove_snippets:
 228             snippets = Snippets(book_id)
 229             snippets.remove()
 230
 231     def index_book(self, book, book_info=None, overwrite=True):
 232         """
 233         Indexes the book.
 234         Creates a lucene document for extracted metadata
 235         and calls self.index_content() to index the contents of the book.
 236         """
 237         if overwrite:
 238             # we don't remove snippets, since they might be still needed by
 239             # threads using not reopened index
 240             self.remove_book(book, remove_snippets=False)
 241
 242         book_doc = self.create_book_doc(book)
 243         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 244         # let's not index it - it's only used for extracting publish date
 245         if 'source_name' in meta_fields:
 246             del meta_fields['source_name']
 247
 248         for n, f in meta_fields.items():
 249             book_doc[n] = f
 250
 251         book_doc['uid'] = "book%s" % book_doc['book_id']
 252         self.index.add(book_doc)
 253         del book_doc
 254         book_fields = {
 255             'title': meta_fields['title'],
 256             'authors': meta_fields['authors'],
 257             'published_date': meta_fields['published_date']
 258             }
 259
 260         if 'translators' in meta_fields:
 261             book_fields['translators'] = meta_fields['translators']
 262
 263         self.index_content(book, book_fields=book_fields)
 264
 265     master_tags = [
 266         'opowiadanie',
 267         'powiesc',
 268         'dramat_wierszowany_l',
 269         'dramat_wierszowany_lp',
 270         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 271         'wywiad',
 272         ]
 273
 274     ignore_content_tags = [
 275         'uwaga', 'extra',
 276         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 277         'didaskalia',
 278         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 279         ]
 280
 281     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 282
 283     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 284                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 285
 286     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 287
 288     def extract_metadata(self, book, book_info=None, dc_only=None):
 289         """
 290         Extract metadata from book and returns a map of fields keyed by fieldname
 291         """
 292         fields = {}
 293
 294         if book_info is None:
 295             book_info = dcparser.parse(open(book.xml_file.path))
 296
 297         fields['slug'] = book.slug
 298         fields['tags'] = [t.name for t in book.tags]
 299         fields['is_book'] = True
 300
 301         # validator, name
 302         for field in dcparser.BookInfo.FIELDS:
 303             if dc_only and field.name not in dc_only:
 304                 continue
 305             if hasattr(book_info, field.name):
 306                 if not getattr(book_info, field.name):
 307                     continue
 308                 # since no type information is available, we use validator
 309                 type_indicator = field.validator
 310                 if type_indicator == dcparser.as_unicode:
 311                     s = getattr(book_info, field.name)
 312                     if field.multiple:
 313                         s = ', '.join(s)
 314                     fields[field.name] = s
 315                 elif type_indicator == dcparser.as_person:
 316                     p = getattr(book_info, field.name)
 317                     if isinstance(p, dcparser.Person):
 318                         persons = unicode(p)
 319                     else:
 320                         persons = ', '.join(map(unicode, p))
 321                     fields[field.name] = persons
 322                 elif type_indicator == dcparser.as_date:
 323                     dt = getattr(book_info, field.name)
 324                     fields[field.name] = dt
 325
 326         # get published date
 327         pd = None
 328         if hasattr(book_info, 'source_name') and book_info.source_name:
 329             match = self.published_date_re.search(book_info.source_name)
 330             if match is not None:
 331                 pd = str(match.groups()[0])
 332         if not pd:
 333             pd = ""
 334         fields["published_date"] = pd
 335
 336         return fields
 337
 338     # def add_gaps(self, fields, fieldname):
 339     #     """
 340     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 341     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 342     #     """
 343     #     def gap():
 344     #         while True:
 345     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 346     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 347
 348     def get_master(self, root):
 349         """
 350         Returns the first master tag from an etree.
 351         """
 352         for master in root.iter():
 353             if master.tag in self.master_tags:
 354                 return master
 355
 356     def index_content(self, book, book_fields):
 357         """
 358         Walks the book XML and extract content from it.
 359         Adds parts for each header tag and for each fragment.
 360         """
 361         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 362         root = wld.edoc.getroot()
 363
 364         master = self.get_master(root)
 365         if master is None:
 366             return []
 367
 368         def walker(node, ignore_tags=()):
 369             if node.tag not in ignore_tags:
 370                 yield node, None, None
 371                 if node.text is not None:
 372                     yield None, node.text, None
 373                 for child in list(node):
 374                     for b, t, e in walker(child):
 375                         yield b, t, e
 376                 yield None, None, node
 377
 378             if node.tail is not None:
 379                 yield None, node.tail, None
 380             return
 381
 382         def fix_format(text):
 383             # separator = [u" ", u"\t", u".", u";", u","]
 384             if isinstance(text, list):
 385                 # need to join it first
 386                 text = filter(lambda s: s is not None, content)
 387                 text = u' '.join(text)
 388                 # for i in range(len(text)):
 389                 #     if i > 0:
 390                 #         if text[i][0] not in separator\
 391                 #             and text[i - 1][-1] not in separator:
 392                 #          text.insert(i, u" ")
 393
 394             return re.sub("(?m)/$", "", text)
 395
 396         def add_part(snippets, **fields):
 397             doc = self.create_book_doc(book)
 398             for n, v in book_fields.items():
 399                 doc[n] = v
 400
 401             doc['header_index'] = fields["header_index"]
 402             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 403             doc['header_type'] = fields['header_type']
 404
 405             doc['text'] = fields['text']
 406
 407             # snippets
 408             snip_pos = snippets.add(fields["text"])
 409
 410             doc['snippets_position'] = snip_pos[0]
 411             doc['snippets_length'] = snip_pos[1]
 412             if snippets.revision:
 413                 doc["snippets_revision"] = snippets.revision
 414
 415             if 'fragment_anchor' in fields:
 416                 doc["fragment_anchor"] = fields['fragment_anchor']
 417
 418             if 'themes' in fields:
 419                 doc['themes'] = fields['themes']
 420             doc['uid'] = "part%s-%s-%s-%s" % (
 421                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 422             return doc
 423
 424         def give_me_utf8(s):
 425             if isinstance(s, unicode):
 426                 return s.encode('utf-8')
 427             else:
 428                 return s
 429
 430         fragments = {}
 431         snippets = Snippets(book.id).open('w')
 432         try:
 433             for header, position in zip(list(master), range(len(master))):
 434
 435                 if header.tag in self.skip_header_tags:
 436                     continue
 437                 if header.tag is etree.Comment:
 438                     continue
 439
 440                 # section content
 441                 content = []
 442                 footnote = []
 443
 444                 def all_content(text):
 445                     for frag in fragments.values():
 446                         frag['text'].append(text)
 447                     content.append(text)
 448                 handle_text = [all_content]
 449
 450                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 451                     # handle footnotes
 452                     if start is not None and start.tag in self.footnote_tags:
 453                         footnote = []
 454
 455                         def collect_footnote(t):
 456                             footnote.append(t)
 457
 458                         handle_text.append(collect_footnote)
 459                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 460                         handle_text.pop()
 461                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 462                                        text=u''.join(footnote),
 463                                        is_footnote=True)
 464                         self.index.add(doc)
 465                         footnote = []
 466
 467                     # handle fragments and themes.
 468                     if start is not None and start.tag == 'begin':
 469                         fid = start.attrib['id'][1:]
 470                         fragments[fid] = {
 471                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 472
 473                     # themes for this fragment
 474                     elif start is not None and start.tag == 'motyw':
 475                         fid = start.attrib['id'][1:]
 476                         handle_text.append(lambda text: None)
 477                         if start.text is not None:
 478                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 479                     elif end is not None and end.tag == 'motyw':
 480                         handle_text.pop()
 481
 482                     elif start is not None and start.tag == 'end':
 483                         fid = start.attrib['id'][1:]
 484                         if fid not in fragments:
 485                             continue  # a broken <end> node, skip it
 486                         frag = fragments[fid]
 487                         if not frag['themes']:
 488                             continue  # empty themes list.
 489                         del fragments[fid]
 490
 491                         doc = add_part(snippets,
 492                                        header_type=frag['start_header'],
 493                                        header_index=frag['start_section'],
 494                                        header_span=position - frag['start_section'] + 1,
 495                                        fragment_anchor=fid,
 496                                        text=fix_format(frag['text']),
 497                                        themes=frag['themes'])
 498                         self.index.add(doc)
 499
 500                         # Collect content.
 501
 502                     if text is not None and handle_text is not []:
 503                         hdl = handle_text[-1]
 504                         hdl(text)
 505
 506                         # in the end, add a section text.
 507                 doc = add_part(snippets, header_index=position,
 508                                header_type=header.tag, text=fix_format(content))
 509
 510                 self.index.add(doc)
 511
 512         finally:
 513             snippets.close()
 514
 515
 516 class SearchResult(object):
 517     def __init__(self, doc, how_found=None, query_terms=None):
 518         self.boost = 1.0
 519         self._hits = []
 520         self._processed_hits = None  # processed hits
 521         self.snippets = []
 522         self.query_terms = query_terms
 523         self._book = None
 524
 525         if 'score' in doc:
 526             self._score = doc['score']
 527         else:
 528             self._score = 0
 529
 530         self.book_id = int(doc["book_id"])
 531
 532         try:
 533             self.published_date = int(doc.get("published_date"))
 534         except ValueError:
 535             self.published_date = 0
 536
 537         # content hits
 538         header_type = doc.get("header_type", None)
 539         # we have a content hit in some header of fragment
 540         if header_type is not None:
 541             sec = (header_type, int(doc["header_index"]))
 542             header_span = doc['header_span']
 543             header_span = header_span is not None and int(header_span) or 1
 544             fragment = doc.get("fragment_anchor", None)
 545             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 546             snippets_rev = doc.get('snippets_revision', None)
 547
 548             hit = (sec + (header_span,), fragment, self._score, {
 549                 'how_found': how_found,
 550                 'snippets_pos': snippets_pos,
 551                 'snippets_revision': snippets_rev,
 552                 'themes': doc.get('themes', []),
 553                 'themes_pl': doc.get('themes_pl', [])
 554                 })
 555
 556             self._hits.append(hit)
 557
 558     def __unicode__(self):
 559         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 560             (self.book_id, len(self._hits),
 561              len(self._processed_hits) if self._processed_hits else -1,
 562              self._score, len(self.snippets))
 563
 564     def __str__(self):
 565         return unicode(self).encode('utf-8')
 566
 567     @property
 568     def score(self):
 569         return self._score * self.boost
 570
 571     def merge(self, other):
 572         if self.book_id != other.book_id:
 573             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 574         self._hits += other._hits
 575         if other.score > self.score:
 576             self._score = other._score
 577         return self
 578
 579     def get_book(self):
 580         if self._book is not None:
 581             return self._book
 582         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 583         return self._book
 584
 585     book = property(get_book)
 586
 587     POSITION = 0
 588     FRAGMENT = 1
 589     POSITION_INDEX = 1
 590     POSITION_SPAN = 2
 591     SCORE = 2
 592     OTHER = 3
 593
 594     @property
 595     def hits(self):
 596         if self._processed_hits is not None:
 597             return self._processed_hits
 598
 599         # to sections and fragments
 600         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 601
 602         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 603
 604         # sections not covered by fragments
 605         sect = filter(lambda s: 0 == len(filter(
 606             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 607                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 608
 609         def remove_duplicates(lst, keyfn, compare):
 610             els = {}
 611             for e in lst:
 612                 eif = keyfn(e)
 613                 if eif in els:
 614                     if compare(els[eif], e) >= 1:
 615                         continue
 616                 els[eif] = e
 617             return els.values()
 618
 619         # remove fragments with duplicated fid's and duplicated snippets
 620         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 621         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 622         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 623
 624         # remove duplicate sections
 625         sections = {}
 626
 627         for s in sect:
 628             si = s[self.POSITION][self.POSITION_INDEX]
 629             # skip existing
 630             if si in sections:
 631                 if sections[si]['score'] >= s[self.SCORE]:
 632                     continue
 633
 634             m = {'score': s[self.SCORE],
 635                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 636                  }
 637             m.update(s[self.OTHER])
 638             sections[si] = m
 639
 640         hits = sections.values()
 641
 642         for f in frags:
 643             try:
 644                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 645             except catalogue.models.Fragment.DoesNotExist:
 646                 # stale index
 647                 continue
 648             # Figure out if we were searching for a token matching some word in theme name.
 649             themes = frag.tags.filter(category='theme')
 650             themes_hit = set()
 651             if self.query_terms is not None:
 652                 for i in range(0, len(f[self.OTHER]['themes'])):
 653                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 654                     tms = map(unicode.lower, tms)
 655                     for qt in self.query_terms:
 656                         if qt in tms:
 657                             themes_hit.add(f[self.OTHER]['themes'][i])
 658                             break
 659
 660             def theme_by_name(n):
 661                 th = filter(lambda t: t.name == n, themes)
 662                 if th:
 663                     return th[0]
 664                 else:
 665                     return None
 666             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 667
 668             m = {'score': f[self.SCORE],
 669                  'fragment': frag,
 670                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 671                  'themes': themes,
 672                  'themes_hit': themes_hit
 673                  }
 674             m.update(f[self.OTHER])
 675             hits.append(m)
 676
 677         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 678
 679         self._processed_hits = hits
 680
 681         return hits
 682
 683     @staticmethod
 684     def aggregate(*result_lists):
 685         books = {}
 686         for rl in result_lists:
 687             for r in rl:
 688                 if r.book_id in books:
 689                     books[r.book_id].merge(r)
 690                 else:
 691                     books[r.book_id] = r
 692         return books.values()
 693
 694     def __cmp__(self, other):
 695         c = cmp(self.score, other.score)
 696         if c == 0:
 697             # this is inverted, because earlier date is better
 698             return cmp(other.published_date, self.published_date)
 699         else:
 700             return c
 701
 702     def __len__(self):
 703         return len(self.hits)
 704
 705     def snippet_pos(self, idx=0):
 706         return self.hits[idx]['snippets_pos']
 707
 708     def snippet_revision(self, idx=0):
 709         try:
 710             return self.hits[idx]['snippets_revision']
 711         except (IndexError, KeyError):
 712             return None
 713
 714
 715 class Search(SolrIndex):
 716     """
 717     Search facilities.
 718     """
 719     def __init__(self, default_field="text"):
 720         super(Search, self).__init__(mode='r')
 721
 722     def make_term_query(self, query, field='text', modal=operator.or_):
 723         """
 724         Returns term queries joined by boolean query.
 725         modal - applies to boolean query
 726         fuzzy - should the query by fuzzy.
 727         """
 728         if query is None:
 729             query = ''
 730         q = self.index.Q()
 731         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 732
 733         return q
 734
 735     def search_phrase(self, searched, field='text', book=False,
 736                       filters=None,
 737                       snippets=False):
 738         if filters is None:
 739             filters = []
 740         if book:
 741             filters.append(self.index.Q(is_book=True))
 742
 743         q = self.index.query(**{field: searched})
 744         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 745         res = q.paginate(rows=100).execute()
 746         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 747
 748     def search_some(self, searched, fields, book=True,
 749                     filters=None, snippets=True, query_terms=None):
 750         assert isinstance(fields, list)
 751         if filters is None:
 752             filters = []
 753         if book:
 754             filters.append(self.index.Q(is_book=True))
 755
 756         query = self.index.Q()
 757
 758         for fld in fields:
 759             query = self.index.Q(query | self.make_term_query(searched, fld))
 760
 761         query = self.index.query(query)
 762         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 763         res = query.execute()
 764         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 765
 766     def search_everywhere(self, searched, query_terms=None):
 767         """
 768         Tries to use search terms to match different fields of book (or its parts).
 769         E.g. one word can be an author survey, another be a part of the title, and the rest
 770         are some words from third chapter.
 771         """
 772         books = []
 773         # content only query : themes x content
 774         q = self.make_term_query(searched, 'text')
 775         q_themes = self.make_term_query(searched, 'themes_pl')
 776
 777         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 778         res = query.execute()
 779
 780         for found in res:
 781             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 782
 783         # query themes/content x author/title/tags
 784         in_content = self.index.Q()
 785         in_meta = self.index.Q()
 786
 787         for fld in ['themes_pl', 'text']:
 788             in_content |= self.make_term_query(searched, field=fld)
 789
 790         for fld in ['tags', 'authors', 'title']:
 791             in_meta |= self.make_term_query(searched, field=fld)
 792
 793         q = in_content & in_meta
 794         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 795
 796         for found in res:
 797             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 798
 799         return books
 800
 801     def get_snippets(self, searchresult, query, field='text', num=1):
 802         """
 803         Returns a snippet for found scoreDoc.
 804         """
 805         maxnum = len(searchresult)
 806         if num is None or num < 0 or num > maxnum:
 807             num = maxnum
 808         book_id = searchresult.book_id
 809         revision = searchresult.snippet_revision()
 810         snippets = Snippets(book_id, revision=revision)
 811         snips = [None] * maxnum
 812         try:
 813             snippets.open()
 814             idx = 0
 815             while idx < maxnum and num > 0:
 816                 position, length = searchresult.snippet_pos(idx)
 817                 if position is None or length is None:
 818                     continue
 819                 text = snippets.get((int(position),
 820                                      int(length)))
 821                 snip = self.index.highlight(text=text, field=field, q=query)
 822                 snips[idx] = snip
 823                 if snip:
 824                     num -= 1
 825                 idx += 1
 826
 827         except IOError, e:
 828             book = catalogue.models.Book.objects.filter(id=book_id)
 829             if not book:
 830                 log.error("Book does not exist for book id = %d" % book_id)
 831             elif not book.get().children.exists():
 832                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 833             return []
 834         finally:
 835             snippets.close()
 836
 837             # remove verse end markers..
 838         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 839
 840         searchresult.snippets = snips
 841
 842         return snips
 843
 844     def hint_tags(self, query, pdcounter=True, prefix=True):
 845         """
 846         Return auto-complete hints for tags
 847         using prefix search.
 848         """
 849         q = self.index.Q()
 850         query = query.strip()
 851         for field in ['tag_name', 'tag_name_pl']:
 852             if prefix:
 853                 q |= self.index.Q(**{field: query + "*"})
 854             else:
 855                 q |= self.make_term_query(query, field=field)
 856         qu = self.index.query(q)
 857
 858         return self.search_tags(qu, pdcounter=pdcounter)
 859
 860     def search_tags(self, query, filters=None, pdcounter=False):
 861         """
 862         Search for Tag objects using query.
 863         """
 864         if not filters:
 865             filters = []
 866         if not pdcounter:
 867             filters.append(~self.index.Q(is_pdcounter=True))
 868         res = self.apply_filters(query, filters).execute()
 869
 870         tags = []
 871         pd_tags = []
 872
 873         for doc in res:
 874             is_pdcounter = doc.get('is_pdcounter', False)
 875             category = doc.get('tag_category')
 876             try:
 877                 if is_pdcounter:
 878                     if category == 'pd_author':
 879                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 880                     elif category == 'pd_book':
 881                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 882                         tag.category = 'pd_book'  # make it look more lik a tag.
 883                     else:
 884                         # WTF
 885                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
 886                             int(doc.get('tag_id')), category)).encode('utf-8')
 887                     pd_tags.append(tag)
 888                 else:
 889                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 890                     tags.append(tag)
 891
 892             except catalogue.models.Tag.DoesNotExist:
 893                 pass
 894             except PDCounterAuthor.DoesNotExist:
 895                 pass
 896             except PDCounterBook.DoesNotExist:
 897                 pass
 898
 899         tags_slugs = set(map(lambda t: t.slug, tags))
 900         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
 901
 902         log.debug('search_tags: %s' % tags)
 903
 904         return tags
 905
 906     def hint_books(self, query, prefix=True):
 907         """
 908         Returns auto-complete hints for book titles
 909         Because we do not index 'pseudo' title-tags.
 910         Prefix search.
 911         """
 912         q = self.index.Q()
 913         query = query.strip()
 914         if prefix:
 915             q |= self.index.Q(title=query + "*")
 916             q |= self.index.Q(title_orig=query + "*")
 917         else:
 918             q |= self.make_term_query(query, field='title')
 919             q |= self.make_term_query(query, field='title_orig')
 920         qu = self.index.query(q)
 921         only_books = self.index.Q(is_book=True)
 922         return self.search_books(qu, [only_books])
 923
 924     def search_books(self, query, filters=None, max_results=10):
 925         """
 926         Searches for Book objects using query
 927         """
 928         bks = []
 929         bks_found = set()
 930         query = query.query(is_book=True)
 931         res = self.apply_filters(query, filters).field_limit(['book_id'])
 932         for r in res:
 933             try:
 934                 bid = r['book_id']
 935                 if bid not in bks_found:
 936                     bks.append(catalogue.models.Book.objects.get(id=bid))
 937                     bks_found.add(bid)
 938             except catalogue.models.Book.DoesNotExist:
 939                 pass
 940         return bks
 941
 942     @staticmethod
 943     def apply_filters(query, filters):
 944         """
 945         Apply filters to a query
 946         """
 947         if filters is None:
 948             filters = []
 949         filters = filter(lambda x: x is not None, filters)
 950         for f in filters:
 951             query = query.query(f)
 952         return query
 953
 954
 955 if getattr(settings, 'SEARCH_MOCK', False):
 956     from .mock_search import Search