src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from itertools import chain
  15 import sunburnt
  16 import custom
  17 import operator
  18 import logging
  19 from wolnelektury.utils import makedirs
  20
  21 log = logging.getLogger('search')
  22
  23
  24 class SolrIndex(object):
  25     def __init__(self, mode=None):
  26         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  27
  28
  29 class Snippets(object):
  30     """
  31     This class manages snippet files for indexed object (book)
  32     the snippets are concatenated together, and their positions and
  33     lengths are kept in lucene index fields.
  34     """
  35     SNIPPET_DIR = "snippets"
  36
  37     def __init__(self, book_id, revision=None):
  38         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         self.book_id = book_id
  40         self.revision = revision
  41         self.file = None
  42         self.position = None
  43
  44     @property
  45     def path(self):
  46         if self.revision:
  47             fn = "%d.%d" % (self.book_id, self.revision)
  48         else:
  49             fn = "%d" % self.book_id
  50
  51         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  52
  53     def open(self, mode='r'):
  54         """
  55         Open the snippet file. Call .close() afterwards.
  56         """
  57         if 'b' not in mode:
  58             mode += 'b'
  59
  60         if 'w' in mode:
  61             if os.path.exists(self.path):
  62                 self.revision = 1
  63                 while True:
  64                     if not os.path.exists(self.path):
  65                         break
  66                     self.revision += 1
  67
  68         self.file = open(self.path, mode)
  69         self.position = 0
  70         return self
  71
  72     def add(self, snippet):
  73         """
  74         Append a snippet (unicode) to the snippet file.
  75         Return a (position, length) tuple
  76         """
  77         txt = snippet.encode('utf-8')
  78         l = len(txt)
  79         self.file.write(txt)
  80         pos = (self.position, l)
  81         self.position += l
  82         return pos
  83
  84     def get(self, pos):
  85         """
  86         Given a tuple of (position, length) return an unicode
  87         of the snippet stored there.
  88         """
  89         self.file.seek(pos[0], 0)
  90         txt = self.file.read(pos[1]).decode('utf-8')
  91         return txt
  92
  93     def close(self):
  94         """Close snippet file"""
  95         if self.file:
  96             self.file.close()
  97
  98     def remove(self):
  99         self.revision = None
 100         try:
 101             os.unlink(self.path)
 102             self.revision = 0
 103             while True:
 104                 self.revision += 1
 105                 os.unlink(self.path)
 106         except OSError:
 107             pass
 108
 109
 110 class Index(SolrIndex):
 111     """
 112     Class indexing books.
 113     """
 114     def __init__(self):
 115         super(Index, self).__init__(mode='rw')
 116
 117     def delete_query(self, *queries):
 118         """
 119         index.delete(queries=...) doesn't work, so let's reimplement it
 120         using deletion of list of uids.
 121         """
 122         uids = set()
 123         for q in queries:
 124             if isinstance(q, sunburnt.search.LuceneQuery):
 125                 q = self.index.query(q)
 126             q.field_limiter.update(['uid'])
 127             st = 0
 128             rows = 100
 129             while True:
 130                 ids = q.paginate(start=st, rows=rows).execute()
 131                 if not len(ids):
 132                     break
 133                 for res in ids:
 134                     uids.add(res['uid'])
 135                 st += rows
 136         if uids:
 137             self.index.delete(uids)
 138             return True
 139         else:
 140             return False
 141
 142     def index_tags(self, *tags, **kw):
 143         """
 144         Re-index global tag list.
 145         Removes all tags from index, then index them again.
 146         Indexed fields include: id, name (with and without polish stems), category
 147         """
 148         log.debug("Indexing tags")
 149         remove_only = kw.get('remove_only', False)
 150         # first, remove tags from index.
 151         if tags:
 152             tag_qs = []
 153             for tag in tags:
 154                 q_id = self.index.Q(tag_id=tag.id)
 155
 156                 if isinstance(tag, PDCounterAuthor):
 157                     q_cat = self.index.Q(tag_category='pd_author')
 158                 elif isinstance(tag, PDCounterBook):
 159                     q_cat = self.index.Q(tag_category='pd_book')
 160                 else:
 161                     q_cat = self.index.Q(tag_category=tag.category)
 162
 163                 q_id_cat = self.index.Q(q_id & q_cat)
 164                 tag_qs.append(q_id_cat)
 165             self.delete_query(*tag_qs)
 166         else:  # all
 167             q = self.index.Q(tag_id__any=True)
 168             self.delete_query(q)
 169
 170         if not remove_only:
 171             # then add them [all or just one passed]
 172             if not tags:
 173                 tags = chain(
 174                     catalogue.models.Tag.objects.exclude(category='set'),
 175                     PDCounterAuthor.objects.all(),
 176                     PDCounterBook.objects.all())
 177
 178             for tag in tags:
 179                 if isinstance(tag, PDCounterAuthor):
 180                     doc = {
 181                         "tag_id": int(tag.id),
 182                         "tag_name": tag.name,
 183                         "tag_name_pl": tag.name,
 184                         "tag_category": 'pd_author',
 185                         "is_pdcounter": True,
 186                         "uid": "tag%d_pd_a" % tag.id
 187                         }
 188                 elif isinstance(tag, PDCounterBook):
 189                     doc = {
 190                         "tag_id": int(tag.id),
 191                         "tag_name": tag.title,
 192                         "tag_name_pl": tag.title,
 193                         "tag_category": 'pd_book',
 194                         "is_pdcounter": True,
 195                         "uid": "tag%d_pd_b" % tag.id
 196                         }
 197                 else:
 198                     doc = {
 199                         "tag_id": int(tag.id),
 200                         "tag_name": tag.name,
 201                         "tag_name_pl": tag.name,
 202                         "tag_category": tag.category,
 203                         "is_pdcounter": False,
 204                         "uid": "tag%d" % tag.id
 205                         }
 206                 self.index.add(doc)
 207
 208     def create_book_doc(self, book):
 209         """
 210         Create a lucene document referring book id.
 211         """
 212         doc = {'book_id': int(book.id)}
 213         if book.parent is not None:
 214             doc['parent_id'] = int(book.parent.id)
 215         return doc
 216
 217     def remove_book(self, book_or_id, remove_snippets=True):
 218         """Removes a book from search index.
 219         book - Book instance."""
 220         if isinstance(book_or_id, catalogue.models.Book):
 221             book_id = book_or_id.id
 222         else:
 223             book_id = book_or_id
 224
 225         self.delete_query(self.index.Q(book_id=book_id))
 226
 227         if remove_snippets:
 228             snippets = Snippets(book_id)
 229             snippets.remove()
 230
 231     def index_book(self, book, book_info=None, overwrite=True):
 232         """
 233         Indexes the book.
 234         Creates a lucene document for extracted metadata
 235         and calls self.index_content() to index the contents of the book.
 236         """
 237         if overwrite:
 238             # we don't remove snippets, since they might be still needed by
 239             # threads using not reopened index
 240             self.remove_book(book, remove_snippets=False)
 241
 242         book_doc = self.create_book_doc(book)
 243         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 244         # let's not index it - it's only used for extracting publish date
 245         if 'source_name' in meta_fields:
 246             del meta_fields['source_name']
 247
 248         for n, f in meta_fields.items():
 249             book_doc[n] = f
 250
 251         book_doc['uid'] = "book%s" % book_doc['book_id']
 252         self.index.add(book_doc)
 253         del book_doc
 254         book_fields = {
 255             'title': meta_fields['title'],
 256             'authors': meta_fields['authors'],
 257             'published_date': meta_fields['published_date']
 258             }
 259
 260         if 'translators' in meta_fields:
 261             book_fields['translators'] = meta_fields['translators']
 262
 263         self.index_content(book, book_fields=book_fields)
 264
 265     master_tags = [
 266         'opowiadanie',
 267         'powiesc',
 268         'dramat_wierszowany_l',
 269         'dramat_wierszowany_lp',
 270         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 271         'wywiad',
 272         ]
 273
 274     ignore_content_tags = [
 275         'uwaga', 'extra',
 276         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 277         'didaskalia',
 278         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 279         ]
 280
 281     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 282
 283     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 284                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 285
 286     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 287
 288     def extract_metadata(self, book, book_info=None, dc_only=None):
 289         """
 290         Extract metadata from book and returns a map of fields keyed by fieldname
 291         """
 292         fields = {}
 293
 294         if book_info is None:
 295             book_info = dcparser.parse(open(book.xml_file.path))
 296
 297         fields['slug'] = book.slug
 298         fields['tags'] = [t.name for t in book.tags]
 299         fields['is_book'] = True
 300
 301         # validator, name
 302         for field in dcparser.BookInfo.FIELDS:
 303             if dc_only and field.name not in dc_only:
 304                 continue
 305             if hasattr(book_info, field.name):
 306                 if not getattr(book_info, field.name):
 307                     continue
 308                 # since no type information is available, we use validator
 309                 type_indicator = field.validator
 310                 if type_indicator == dcparser.as_unicode:
 311                     s = getattr(book_info, field.name)
 312                     if field.multiple:
 313                         s = ', '.join(s)
 314                     fields[field.name] = s
 315                 elif type_indicator == dcparser.as_person:
 316                     p = getattr(book_info, field.name)
 317                     if isinstance(p, dcparser.Person):
 318                         persons = unicode(p)
 319                     else:
 320                         persons = ', '.join(map(unicode, p))
 321                     fields[field.name] = persons
 322                 elif type_indicator == dcparser.as_date:
 323                     dt = getattr(book_info, field.name)
 324                     fields[field.name] = dt
 325
 326         # get published date
 327         pd = None
 328         if hasattr(book_info, 'source_name') and book_info.source_name:
 329             match = self.published_date_re.search(book_info.source_name)
 330             if match is not None:
 331                 pd = str(match.groups()[0])
 332         if not pd:
 333             pd = ""
 334         fields["published_date"] = pd
 335
 336         return fields
 337
 338     # def add_gaps(self, fields, fieldname):
 339     #     """
 340     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 341     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 342     #     """
 343     #     def gap():
 344     #         while True:
 345     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 346     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 347
 348     def get_master(self, root):
 349         """
 350         Returns the first master tag from an etree.
 351         """
 352         for master in root.iter():
 353             if master.tag in self.master_tags:
 354                 return master
 355
 356     def index_content(self, book, book_fields):
 357         """
 358         Walks the book XML and extract content from it.
 359         Adds parts for each header tag and for each fragment.
 360         """
 361         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 362         root = wld.edoc.getroot()
 363
 364         master = self.get_master(root)
 365         if master is None:
 366             return []
 367
 368         def walker(node, ignore_tags=()):
 369             if node.tag not in ignore_tags:
 370                 yield node, None, None
 371                 if node.text is not None:
 372                     yield None, node.text, None
 373                 for child in list(node):
 374                     for b, t, e in walker(child):
 375                         yield b, t, e
 376                 yield None, None, node
 377
 378             if node.tail is not None:
 379                 yield None, node.tail, None
 380             return
 381
 382         def fix_format(text):
 383             # separator = [u" ", u"\t", u".", u";", u","]
 384             if isinstance(text, list):
 385                 # need to join it first
 386                 text = filter(lambda s: s is not None, content)
 387                 text = u' '.join(text)
 388                 # for i in range(len(text)):
 389                 #     if i > 0:
 390                 #         if text[i][0] not in separator\
 391                 #             and text[i - 1][-1] not in separator:
 392                 #          text.insert(i, u" ")
 393
 394             return re.sub("(?m)/$", "", text)
 395
 396         def add_part(snippets, **fields):
 397             doc = self.create_book_doc(book)
 398             for n, v in book_fields.items():
 399                 doc[n] = v
 400
 401             doc['header_index'] = fields["header_index"]
 402             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 403             doc['header_type'] = fields['header_type']
 404
 405             doc['text'] = fields['text']
 406
 407             # snippets
 408             snip_pos = snippets.add(fields["text"])
 409
 410             doc['snippets_position'] = snip_pos[0]
 411             doc['snippets_length'] = snip_pos[1]
 412             if snippets.revision:
 413                 doc["snippets_revision"] = snippets.revision
 414
 415             if 'fragment_anchor' in fields:
 416                 doc["fragment_anchor"] = fields['fragment_anchor']
 417
 418             if 'themes' in fields:
 419                 doc['themes'] = fields['themes']
 420             doc['uid'] = "part%s%s%s" % (doc['header_index'],
 421                                          doc['header_span'],
 422                                          doc.get('fragment_anchor', ''))
 423             return doc
 424
 425         def give_me_utf8(s):
 426             if isinstance(s, unicode):
 427                 return s.encode('utf-8')
 428             else:
 429                 return s
 430
 431         fragments = {}
 432         snippets = Snippets(book.id).open('w')
 433         try:
 434             for header, position in zip(list(master), range(len(master))):
 435
 436                 if header.tag in self.skip_header_tags:
 437                     continue
 438                 if header.tag is etree.Comment:
 439                     continue
 440
 441                 # section content
 442                 content = []
 443                 footnote = []
 444
 445                 def all_content(text):
 446                     for frag in fragments.values():
 447                         frag['text'].append(text)
 448                     content.append(text)
 449                 handle_text = [all_content]
 450
 451                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 452                     # handle footnotes
 453                     if start is not None and start.tag in self.footnote_tags:
 454                         footnote = []
 455
 456                         def collect_footnote(t):
 457                             footnote.append(t)
 458
 459                         handle_text.append(collect_footnote)
 460                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 461                         handle_text.pop()
 462                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 463                                        text=u''.join(footnote),
 464                                        is_footnote=True)
 465                         self.index.add(doc)
 466                         footnote = []
 467
 468                     # handle fragments and themes.
 469                     if start is not None and start.tag == 'begin':
 470                         fid = start.attrib['id'][1:]
 471                         fragments[fid] = {
 472                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 473
 474                     # themes for this fragment
 475                     elif start is not None and start.tag == 'motyw':
 476                         fid = start.attrib['id'][1:]
 477                         handle_text.append(lambda text: None)
 478                         if start.text is not None:
 479                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 480                     elif end is not None and end.tag == 'motyw':
 481                         handle_text.pop()
 482
 483                     elif start is not None and start.tag == 'end':
 484                         fid = start.attrib['id'][1:]
 485                         if fid not in fragments:
 486                             continue  # a broken <end> node, skip it
 487                         frag = fragments[fid]
 488                         if not frag['themes']:
 489                             continue  # empty themes list.
 490                         del fragments[fid]
 491
 492                         doc = add_part(snippets,
 493                                        header_type=frag['start_header'],
 494                                        header_index=frag['start_section'],
 495                                        header_span=position - frag['start_section'] + 1,
 496                                        fragment_anchor=fid,
 497                                        text=fix_format(frag['text']),
 498                                        themes=frag['themes'])
 499                         self.index.add(doc)
 500
 501                         # Collect content.
 502
 503                     if text is not None and handle_text is not []:
 504                         hdl = handle_text[-1]
 505                         hdl(text)
 506
 507                         # in the end, add a section text.
 508                 doc = add_part(snippets, header_index=position,
 509                                header_type=header.tag, text=fix_format(content))
 510
 511                 self.index.add(doc)
 512
 513         finally:
 514             snippets.close()
 515
 516
 517 class SearchResult(object):
 518     def __init__(self, doc, how_found=None, query=None, query_terms=None):
 519         #        self.search = search
 520         self.boost = 1.0
 521         self._hits = []
 522         self._processed_hits = None  # processed hits
 523         self.snippets = []
 524         self.query_terms = query_terms
 525         self._book = None
 526
 527         if 'score' in doc:
 528             self._score = doc['score']
 529         else:
 530             self._score = 0
 531
 532         self.book_id = int(doc["book_id"])
 533
 534         try:
 535             self.published_date = int(doc.get("published_date"))
 536         except ValueError:
 537             self.published_date = 0
 538
 539         # content hits
 540         header_type = doc.get("header_type", None)
 541         # we have a content hit in some header of fragment
 542         if header_type is not None:
 543             sec = (header_type, int(doc["header_index"]))
 544             header_span = doc['header_span']
 545             header_span = header_span is not None and int(header_span) or 1
 546             fragment = doc.get("fragment_anchor", None)
 547             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 548             snippets_rev = doc.get('snippets_revision', None)
 549
 550             hit = (sec + (header_span,), fragment, self._score, {
 551                 'how_found': how_found,
 552                 'snippets_pos': snippets_pos,
 553                 'snippets_revision': snippets_rev,
 554                 'themes': doc.get('themes', []),
 555                 'themes_pl': doc.get('themes_pl', [])
 556                 })
 557
 558             self._hits.append(hit)
 559
 560     def __unicode__(self):
 561         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 562             (self.book_id, len(self._hits),
 563              len(self._processed_hits) if self._processed_hits else -1,
 564              self._score, len(self.snippets))
 565
 566     def __str__(self):
 567         return unicode(self).encode('utf-8')
 568
 569     @property
 570     def score(self):
 571         return self._score * self.boost
 572
 573     def merge(self, other):
 574         if self.book_id != other.book_id:
 575             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 576         self._hits += other._hits
 577         if other.score > self.score:
 578             self._score = other._score
 579         return self
 580
 581     def get_book(self):
 582         if self._book is not None:
 583             return self._book
 584         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 585         return self._book
 586
 587     book = property(get_book)
 588
 589     POSITION = 0
 590     FRAGMENT = 1
 591     POSITION_INDEX = 1
 592     POSITION_SPAN = 2
 593     SCORE = 2
 594     OTHER = 3
 595
 596     @property
 597     def hits(self):
 598         if self._processed_hits is not None:
 599             return self._processed_hits
 600
 601         # to sections and fragments
 602         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 603
 604         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 605
 606         # sections not covered by fragments
 607         sect = filter(lambda s: 0 == len(filter(
 608             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 609                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 610
 611         def remove_duplicates(lst, keyfn, compare):
 612             els = {}
 613             for e in lst:
 614                 eif = keyfn(e)
 615                 if eif in els:
 616                     if compare(els[eif], e) >= 1:
 617                         continue
 618                 els[eif] = e
 619             return els.values()
 620
 621         # remove fragments with duplicated fid's and duplicated snippets
 622         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 623         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 624         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 625
 626         # remove duplicate sections
 627         sections = {}
 628
 629         for s in sect:
 630             si = s[self.POSITION][self.POSITION_INDEX]
 631             # skip existing
 632             if si in sections:
 633                 if sections[si]['score'] >= s[self.SCORE]:
 634                     continue
 635
 636             m = {'score': s[self.SCORE],
 637                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 638                  }
 639             m.update(s[self.OTHER])
 640             sections[si] = m
 641
 642         hits = sections.values()
 643
 644         for f in frags:
 645             try:
 646                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 647             except catalogue.models.Fragment.DoesNotExist:
 648                 # stale index
 649                 continue
 650             # Figure out if we were searching for a token matching some word in theme name.
 651             themes = frag.tags.filter(category='theme')
 652             themes_hit = set()
 653             if self.query_terms is not None:
 654                 for i in range(0, len(f[self.OTHER]['themes'])):
 655                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 656                     tms = map(unicode.lower, tms)
 657                     for qt in self.query_terms:
 658                         if qt in tms:
 659                             themes_hit.add(f[self.OTHER]['themes'][i])
 660                             break
 661
 662             def theme_by_name(n):
 663                 th = filter(lambda t: t.name == n, themes)
 664                 if th:
 665                     return th[0]
 666                 else:
 667                     return None
 668             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 669
 670             m = {'score': f[self.SCORE],
 671                  'fragment': frag,
 672                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 673                  'themes': themes,
 674                  'themes_hit': themes_hit
 675                  }
 676             m.update(f[self.OTHER])
 677             hits.append(m)
 678
 679         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 680
 681         self._processed_hits = hits
 682
 683         return hits
 684
 685     @staticmethod
 686     def aggregate(*result_lists):
 687         books = {}
 688         for rl in result_lists:
 689             for r in rl:
 690                 if r.book_id in books:
 691                     books[r.book_id].merge(r)
 692                 else:
 693                     books[r.book_id] = r
 694         return books.values()
 695
 696     def __cmp__(self, other):
 697         c = cmp(self.score, other.score)
 698         if c == 0:
 699             # this is inverted, because earlier date is better
 700             return cmp(other.published_date, self.published_date)
 701         else:
 702             return c
 703
 704     def __len__(self):
 705         return len(self.hits)
 706
 707     def snippet_pos(self, idx=0):
 708         return self.hits[idx]['snippets_pos']
 709
 710     def snippet_revision(self, idx=0):
 711         try:
 712             return self.hits[idx]['snippets_revision']
 713         except (IndexError, KeyError):
 714             return None
 715
 716
 717 class Search(SolrIndex):
 718     """
 719     Search facilities.
 720     """
 721     def __init__(self, default_field="text"):
 722         super(Search, self).__init__(mode='r')
 723
 724     def make_term_query(self, query, field='text', modal=operator.or_):
 725         """
 726         Returns term queries joined by boolean query.
 727         modal - applies to boolean query
 728         fuzzy - should the query by fuzzy.
 729         """
 730         if query is None:
 731             query = ''
 732         q = self.index.Q()
 733         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 734
 735         return q
 736
 737     def search_phrase(self, searched, field='text', book=False,
 738                       filters=None,
 739                       snippets=False):
 740         if filters is None:
 741             filters = []
 742         if book:
 743             filters.append(self.index.Q(is_book=True))
 744
 745         q = self.index.query(**{field: searched})
 746         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 747         res = q.execute()
 748         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 749
 750     def search_some(self, searched, fields, book=True,
 751                     filters=None, snippets=True, query_terms=None):
 752         assert isinstance(fields, list)
 753         if filters is None:
 754             filters = []
 755         if book:
 756             filters.append(self.index.Q(is_book=True))
 757
 758         query = self.index.Q()
 759
 760         for fld in fields:
 761             query = self.index.Q(query | self.make_term_query(searched, fld))
 762
 763         query = self.index.query(query)
 764         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 765         res = query.execute()
 766         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 767
 768     def search_everywhere(self, searched, query_terms=None):
 769         """
 770         Tries to use search terms to match different fields of book (or its parts).
 771         E.g. one word can be an author survey, another be a part of the title, and the rest
 772         are some words from third chapter.
 773         """
 774         books = []
 775         # content only query : themes x content
 776         q = self.make_term_query(searched, 'text')
 777         q_themes = self.make_term_query(searched, 'themes_pl')
 778
 779         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 780         res = query.execute()
 781
 782         for found in res:
 783             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 784
 785         # query themes/content x author/title/tags
 786         in_content = self.index.Q()
 787         in_meta = self.index.Q()
 788
 789         for fld in ['themes_pl', 'text']:
 790             in_content |= self.make_term_query(searched, field=fld)
 791
 792         for fld in ['tags', 'authors', 'title']:
 793             in_meta |= self.make_term_query(searched, field=fld)
 794
 795         q = in_content & in_meta
 796         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 797
 798         for found in res:
 799             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 800
 801         return books
 802
 803     def get_snippets(self, searchresult, query, field='text', num=1):
 804         """
 805         Returns a snippet for found scoreDoc.
 806         """
 807         maxnum = len(searchresult)
 808         if num is None or num < 0 or num > maxnum:
 809             num = maxnum
 810         book_id = searchresult.book_id
 811         revision = searchresult.snippet_revision()
 812         snippets = Snippets(book_id, revision=revision)
 813         snips = [None] * maxnum
 814         try:
 815             snippets.open()
 816             idx = 0
 817             while idx < maxnum and num > 0:
 818                 position, length = searchresult.snippet_pos(idx)
 819                 if position is None or length is None:
 820                     continue
 821                 text = snippets.get((int(position),
 822                                      int(length)))
 823                 snip = self.index.highlight(text=text, field=field, q=query)
 824                 snips[idx] = snip
 825                 if snip:
 826                     num -= 1
 827                 idx += 1
 828
 829         except IOError, e:
 830             book = catalogue.models.Book.objects.filter(id=book_id)
 831             if not book:
 832                 log.error("Book does not exist for book id = %d" % book_id)
 833             elif not book.get().children.exists():
 834                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 835             return []
 836         finally:
 837             snippets.close()
 838
 839             # remove verse end markers..
 840         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 841
 842         searchresult.snippets = snips
 843
 844         return snips
 845
 846     def hint_tags(self, query, pdcounter=True, prefix=True):
 847         """
 848         Return auto-complete hints for tags
 849         using prefix search.
 850         """
 851         q = self.index.Q()
 852         query = query.strip()
 853         for field in ['tag_name', 'tag_name_pl']:
 854             if prefix:
 855                 q |= self.index.Q(**{field: query + "*"})
 856             else:
 857                 q |= self.make_term_query(query, field=field)
 858         qu = self.index.query(q)
 859
 860         return self.search_tags(qu, pdcounter=pdcounter)
 861
 862     def search_tags(self, query, filters=None, pdcounter=False):
 863         """
 864         Search for Tag objects using query.
 865         """
 866         if not filters:
 867             filters = []
 868         if not pdcounter:
 869             filters.append(~self.index.Q(is_pdcounter=True))
 870         res = self.apply_filters(query, filters).execute()
 871
 872         tags = []
 873         pd_tags = []
 874
 875         for doc in res:
 876             is_pdcounter = doc.get('is_pdcounter', False)
 877             category = doc.get('tag_category')
 878             try:
 879                 if is_pdcounter:
 880                     if category == 'pd_author':
 881                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 882                     elif category == 'pd_book':
 883                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 884                         tag.category = 'pd_book'  # make it look more lik a tag.
 885                     else:
 886                         # WTF
 887                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
 888                             int(doc.get('tag_id')), category)).encode('utf-8')
 889                     pd_tags.append(tag)
 890                 else:
 891                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 892                     tags.append(tag)
 893
 894             except catalogue.models.Tag.DoesNotExist:
 895                 pass
 896             except PDCounterAuthor.DoesNotExist:
 897                 pass
 898             except PDCounterBook.DoesNotExist:
 899                 pass
 900
 901         tags_slugs = set(map(lambda t: t.slug, tags))
 902         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
 903
 904         log.debug('search_tags: %s' % tags)
 905
 906         return tags
 907
 908     def hint_books(self, query, prefix=True):
 909         """
 910         Returns auto-complete hints for book titles
 911         Because we do not index 'pseudo' title-tags.
 912         Prefix search.
 913         """
 914         q = self.index.Q()
 915         query = query.strip()
 916         if prefix:
 917             q |= self.index.Q(title=query + "*")
 918         else:
 919             q |= self.make_term_query(query, field='title')
 920         qu = self.index.query(q)
 921         only_books = self.index.Q(is_book=True)
 922         return self.search_books(qu, [only_books])
 923
 924     def search_books(self, query, filters=None, max_results=10):
 925         """
 926         Searches for Book objects using query
 927         """
 928         bks = []
 929         bks_found = set()
 930         query = query.query(is_book=True)
 931         res = self.apply_filters(query, filters).field_limit(['book_id'])
 932         for r in res:
 933             try:
 934                 bid = r['book_id']
 935                 if bid not in bks_found:
 936                     bks.append(catalogue.models.Book.objects.get(id=bid))
 937                     bks_found.add(bid)
 938             except catalogue.models.Book.DoesNotExist:
 939                 pass
 940         return bks
 941
 942     @staticmethod
 943     def apply_filters(query, filters):
 944         """
 945         Apply filters to a query
 946         """
 947         if filters is None:
 948             filters = []
 949         filters = filter(lambda x: x is not None, filters)
 950         for f in filters:
 951             query = query.query(f)
 952         return query
 953
 954
 955 if getattr(settings, 'SEARCH_MOCK', False):
 956     from .mock_search import Search