src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  14 from itertools import chain
  15 import sunburnt
  16 import custom
  17 import operator
  18 import logging
  19 from wolnelektury.utils import makedirs
  20
  21 log = logging.getLogger('search')
  22
  23
  24 class SolrIndex(object):
  25     def __init__(self, mode=None):
  26         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  27
  28
  29 class Snippets(object):
  30     """
  31     This class manages snippet files for indexed object (book)
  32     the snippets are concatenated together, and their positions and
  33     lengths are kept in lucene index fields.
  34     """
  35     SNIPPET_DIR = "snippets"
  36
  37     def __init__(self, book_id, revision=None):
  38         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         self.book_id = book_id
  40         self.revision = revision
  41         self.file = None
  42         self.position = None
  43
  44     @property
  45     def path(self):
  46         if self.revision:
  47             fn = "%d.%d" % (self.book_id, self.revision)
  48         else:
  49             fn = "%d" % self.book_id
  50
  51         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  52
  53     def open(self, mode='r'):
  54         """
  55         Open the snippet file. Call .close() afterwards.
  56         """
  57         if 'b' not in mode:
  58             mode += 'b'
  59
  60         if 'w' in mode:
  61             if os.path.exists(self.path):
  62                 self.revision = 1
  63                 while True:
  64                     if not os.path.exists(self.path):
  65                         break
  66                     self.revision += 1
  67
  68         self.file = open(self.path, mode)
  69         self.position = 0
  70         return self
  71
  72     def add(self, snippet):
  73         """
  74         Append a snippet (unicode) to the snippet file.
  75         Return a (position, length) tuple
  76         """
  77         txt = snippet.encode('utf-8')
  78         l = len(txt)
  79         self.file.write(txt)
  80         pos = (self.position, l)
  81         self.position += l
  82         return pos
  83
  84     def get(self, pos):
  85         """
  86         Given a tuple of (position, length) return an unicode
  87         of the snippet stored there.
  88         """
  89         self.file.seek(pos[0], 0)
  90         txt = self.file.read(pos[1]).decode('utf-8')
  91         return txt
  92
  93     def close(self):
  94         """Close snippet file"""
  95         if self.file:
  96             self.file.close()
  97
  98     def remove(self):
  99         self.revision = None
 100         try:
 101             os.unlink(self.path)
 102             self.revision = 0
 103             while True:
 104                 self.revision += 1
 105                 os.unlink(self.path)
 106         except OSError:
 107             pass
 108
 109
 110 class Index(SolrIndex):
 111     """
 112     Class indexing books.
 113     """
 114     def __init__(self):
 115         super(Index, self).__init__(mode='rw')
 116
 117     def delete_query(self, *queries):
 118         """
 119         index.delete(queries=...) doesn't work, so let's reimplement it
 120         using deletion of list of uids.
 121         """
 122         uids = set()
 123         for q in queries:
 124             if isinstance(q, sunburnt.search.LuceneQuery):
 125                 q = self.index.query(q)
 126             q.field_limiter.update(['uid'])
 127             st = 0
 128             rows = 100
 129             while True:
 130                 ids = q.paginate(start=st, rows=rows).execute()
 131                 if not len(ids):
 132                     break
 133                 for res in ids:
 134                     uids.add(res['uid'])
 135                 st += rows
 136         if uids:
 137             self.index.delete(uids)
 138             return True
 139         else:
 140             return False
 141
 142     def index_tags(self, *tags, **kw):
 143         """
 144         Re-index global tag list.
 145         Removes all tags from index, then index them again.
 146         Indexed fields include: id, name (with and without polish stems), category
 147         """
 148         log.debug("Indexing tags")
 149         remove_only = kw.get('remove_only', False)
 150         # first, remove tags from index.
 151         if tags:
 152             tag_qs = []
 153             for tag in tags:
 154                 q_id = self.index.Q(tag_id=tag.id)
 155
 156                 if isinstance(tag, PDCounterAuthor):
 157                     q_cat = self.index.Q(tag_category='pd_author')
 158                 elif isinstance(tag, PDCounterBook):
 159                     q_cat = self.index.Q(tag_category='pd_book')
 160                 else:
 161                     q_cat = self.index.Q(tag_category=tag.category)
 162
 163                 q_id_cat = self.index.Q(q_id & q_cat)
 164                 tag_qs.append(q_id_cat)
 165             self.delete_query(*tag_qs)
 166         else:  # all
 167             q = self.index.Q(tag_id__any=True)
 168             self.delete_query(q)
 169
 170         if not remove_only:
 171             # then add them [all or just one passed]
 172             if not tags:
 173                 tags = chain(
 174                     catalogue.models.Tag.objects.exclude(category='set'),
 175                     PDCounterAuthor.objects.all(),
 176                     PDCounterBook.objects.all())
 177
 178             for tag in tags:
 179                 if isinstance(tag, PDCounterAuthor):
 180                     doc = {
 181                         "tag_id": int(tag.id),
 182                         "tag_name": tag.name,
 183                         "tag_name_pl": tag.name,
 184                         "tag_category": 'pd_author',
 185                         "is_pdcounter": True,
 186                         "uid": "tag%d_pd_a" % tag.id
 187                         }
 188                 elif isinstance(tag, PDCounterBook):
 189                     doc = {
 190                         "tag_id": int(tag.id),
 191                         "tag_name": tag.title,
 192                         "tag_name_pl": tag.title,
 193                         "tag_category": 'pd_book',
 194                         "is_pdcounter": True,
 195                         "uid": "tag%d_pd_b" % tag.id
 196                         }
 197                 else:
 198                     doc = {
 199                         "tag_id": int(tag.id),
 200                         "tag_name": tag.name,
 201                         "tag_name_pl": tag.name,
 202                         "tag_category": tag.category,
 203                         "is_pdcounter": False,
 204                         "uid": "tag%d" % tag.id
 205                         }
 206                 self.index.add(doc)
 207
 208     def create_book_doc(self, book):
 209         """
 210         Create a lucene document referring book id.
 211         """
 212         doc = {'book_id': int(book.id)}
 213         if book.parent is not None:
 214             doc['parent_id'] = int(book.parent.id)
 215         return doc
 216
 217     def remove_book(self, book_or_id, remove_snippets=True):
 218         """Removes a book from search index.
 219         book - Book instance."""
 220         if isinstance(book_or_id, catalogue.models.Book):
 221             book_id = book_or_id.id
 222         else:
 223             book_id = book_or_id
 224
 225         self.delete_query(self.index.Q(book_id=book_id))
 226
 227         if remove_snippets:
 228             snippets = Snippets(book_id)
 229             snippets.remove()
 230
 231     def index_book(self, book, book_info=None, overwrite=True):
 232         """
 233         Indexes the book.
 234         Creates a lucene document for extracted metadata
 235         and calls self.index_content() to index the contents of the book.
 236         """
 237         if overwrite:
 238             # we don't remove snippets, since they might be still needed by
 239             # threads using not reopened index
 240             self.remove_book(book, remove_snippets=False)
 241
 242         book_doc = self.create_book_doc(book)
 243         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 244         # let's not index it - it's only used for extracting publish date
 245         if 'source_name' in meta_fields:
 246             del meta_fields['source_name']
 247
 248         for n, f in meta_fields.items():
 249             book_doc[n] = f
 250
 251         book_doc['uid'] = "book%s" % book_doc['book_id']
 252         self.index.add(book_doc)
 253         del book_doc
 254         book_fields = {
 255             'title': meta_fields['title'],
 256             'authors': meta_fields['authors'],
 257             'published_date': meta_fields['published_date']
 258             }
 259
 260         if 'translators' in meta_fields:
 261             book_fields['translators'] = meta_fields['translators']
 262
 263         self.index_content(book, book_fields=book_fields)
 264
 265     master_tags = [
 266         'opowiadanie',
 267         'powiesc',
 268         'dramat_wierszowany_l',
 269         'dramat_wierszowany_lp',
 270         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 271         'wywiad',
 272         ]
 273
 274     ignore_content_tags = [
 275         'uwaga', 'extra', 'nota_red',
 276         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 277         'didaskalia',
 278         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 279         ]
 280
 281     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 282
 283     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 284                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 285
 286     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 287
 288     def extract_metadata(self, book, book_info=None, dc_only=None):
 289         """
 290         Extract metadata from book and returns a map of fields keyed by fieldname
 291         """
 292         fields = {}
 293
 294         if book_info is None:
 295             book_info = dcparser.parse(open(book.xml_file.path))
 296
 297         fields['slug'] = book.slug
 298         fields['tags'] = [t.name for t in book.tags]
 299         fields['is_book'] = True
 300
 301         # validator, name
 302         for field in dcparser.BookInfo.FIELDS:
 303             if dc_only and field.name not in dc_only:
 304                 continue
 305             if hasattr(book_info, field.name):
 306                 if not getattr(book_info, field.name):
 307                     continue
 308                 # since no type information is available, we use validator
 309                 type_indicator = field.validator
 310                 if type_indicator == dcparser.as_unicode:
 311                     s = getattr(book_info, field.name)
 312                     if field.multiple:
 313                         s = ', '.join(s)
 314                     fields[field.name] = s
 315                 elif type_indicator == dcparser.as_person:
 316                     p = getattr(book_info, field.name)
 317                     if isinstance(p, dcparser.Person):
 318                         persons = unicode(p)
 319                     else:
 320                         persons = ', '.join(map(unicode, p))
 321                     fields[field.name] = persons
 322                 elif type_indicator == dcparser.as_date:
 323                     dt = getattr(book_info, field.name)
 324                     fields[field.name] = dt
 325
 326         # get published date
 327         pd = None
 328         if hasattr(book_info, 'source_name') and book_info.source_name:
 329             match = self.published_date_re.search(book_info.source_name)
 330             if match is not None:
 331                 pd = str(match.groups()[0])
 332         if not pd:
 333             pd = ""
 334         fields["published_date"] = pd
 335
 336         return fields
 337
 338     # def add_gaps(self, fields, fieldname):
 339     #     """
 340     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 341     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 342     #     """
 343     #     def gap():
 344     #         while True:
 345     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 346     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 347
 348     def get_master(self, root):
 349         """
 350         Returns the first master tag from an etree.
 351         """
 352         for master in root.iter():
 353             if master.tag in self.master_tags:
 354                 return master
 355
 356     def index_content(self, book, book_fields):
 357         """
 358         Walks the book XML and extract content from it.
 359         Adds parts for each header tag and for each fragment.
 360         """
 361         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 362         root = wld.edoc.getroot()
 363
 364         master = self.get_master(root)
 365         if master is None:
 366             return []
 367
 368         def walker(node):
 369             if node.tag not in self.ignore_content_tags:
 370                 yield node, None, None
 371                 if node.text is not None:
 372                     yield None, node.text, None
 373                 for child in list(node):
 374                     for b, t, e in walker(child):
 375                         yield b, t, e
 376                 yield None, None, node
 377
 378             if node.tail is not None:
 379                 yield None, node.tail, None
 380             return
 381
 382         def fix_format(text):
 383             # separator = [u" ", u"\t", u".", u";", u","]
 384             if isinstance(text, list):
 385                 # need to join it first
 386                 text = filter(lambda s: s is not None, content)
 387                 text = u' '.join(text)
 388                 # for i in range(len(text)):
 389                 #     if i > 0:
 390                 #         if text[i][0] not in separator\
 391                 #             and text[i - 1][-1] not in separator:
 392                 #          text.insert(i, u" ")
 393
 394             return re.sub("(?m)/$", "", text)
 395
 396         def add_part(snippets, **fields):
 397             doc = self.create_book_doc(book)
 398             for n, v in book_fields.items():
 399                 doc[n] = v
 400
 401             doc['header_index'] = fields["header_index"]
 402             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 403             doc['header_type'] = fields['header_type']
 404
 405             doc['text'] = fields['text']
 406
 407             # snippets
 408             snip_pos = snippets.add(fields["text"])
 409
 410             doc['snippets_position'] = snip_pos[0]
 411             doc['snippets_length'] = snip_pos[1]
 412             if snippets.revision:
 413                 doc["snippets_revision"] = snippets.revision
 414
 415             if 'fragment_anchor' in fields:
 416                 doc["fragment_anchor"] = fields['fragment_anchor']
 417
 418             if 'themes' in fields:
 419                 doc['themes'] = fields['themes']
 420             doc['uid'] = "part%s-%s-%s-%s" % (
 421                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 422             return doc
 423
 424         fragments = {}
 425         snippets = Snippets(book.id).open('w')
 426         try:
 427             for header, position in zip(list(master), range(len(master))):
 428
 429                 if header.tag in self.skip_header_tags:
 430                     continue
 431                 if header.tag is etree.Comment:
 432                     continue
 433
 434                 # section content
 435                 content = []
 436                 footnote = []
 437
 438                 def all_content(text):
 439                     for frag in fragments.values():
 440                         frag['text'].append(text)
 441                     content.append(text)
 442                 handle_text = [all_content]
 443
 444                 for start, text, end in walker(header):
 445                     # handle footnotes
 446                     if start is not None and start.tag in self.footnote_tags:
 447                         footnote = []
 448
 449                         def collect_footnote(t):
 450                             footnote.append(t)
 451
 452                         handle_text.append(collect_footnote)
 453                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 454                         handle_text.pop()
 455                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 456                                        text=u''.join(footnote),
 457                                        is_footnote=True)
 458                         self.index.add(doc)
 459                         footnote = []
 460
 461                     # handle fragments and themes.
 462                     if start is not None and start.tag == 'begin':
 463                         fid = start.attrib['id'][1:]
 464                         fragments[fid] = {
 465                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 466
 467                     # themes for this fragment
 468                     elif start is not None and start.tag == 'motyw':
 469                         fid = start.attrib['id'][1:]
 470                         handle_text.append(lambda text: None)
 471                         if start.text is not None:
 472                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 473                     elif end is not None and end.tag == 'motyw':
 474                         handle_text.pop()
 475
 476                     elif start is not None and start.tag == 'end':
 477                         fid = start.attrib['id'][1:]
 478                         if fid not in fragments:
 479                             continue  # a broken <end> node, skip it
 480                         frag = fragments[fid]
 481                         if not frag['themes']:
 482                             continue  # empty themes list.
 483                         del fragments[fid]
 484
 485                         doc = add_part(snippets,
 486                                        header_type=frag['start_header'],
 487                                        header_index=frag['start_section'],
 488                                        header_span=position - frag['start_section'] + 1,
 489                                        fragment_anchor=fid,
 490                                        text=fix_format(frag['text']),
 491                                        themes=frag['themes'])
 492                         self.index.add(doc)
 493
 494                         # Collect content.
 495
 496                     if text is not None and handle_text is not []:
 497                         hdl = handle_text[-1]
 498                         hdl(text)
 499
 500                         # in the end, add a section text.
 501                 doc = add_part(snippets, header_index=position,
 502                                header_type=header.tag, text=fix_format(content))
 503
 504                 self.index.add(doc)
 505
 506         finally:
 507             snippets.close()
 508
 509
 510 class SearchResult(object):
 511     def __init__(self, doc, how_found=None, query_terms=None):
 512         self.boost = 1.0
 513         self._hits = []
 514         self._processed_hits = None  # processed hits
 515         self.snippets = []
 516         self.query_terms = query_terms
 517         self._book = None
 518
 519         if 'score' in doc:
 520             self._score = doc['score']
 521         else:
 522             self._score = 0
 523
 524         self.book_id = int(doc["book_id"])
 525
 526         try:
 527             self.published_date = int(doc.get("published_date"))
 528         except ValueError:
 529             self.published_date = 0
 530
 531         # content hits
 532         header_type = doc.get("header_type", None)
 533         # we have a content hit in some header of fragment
 534         if header_type is not None:
 535             sec = (header_type, int(doc["header_index"]))
 536             header_span = doc['header_span']
 537             header_span = header_span is not None and int(header_span) or 1
 538             fragment = doc.get("fragment_anchor", None)
 539             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 540             snippets_rev = doc.get('snippets_revision', None)
 541
 542             hit = (sec + (header_span,), fragment, self._score, {
 543                 'how_found': how_found,
 544                 'snippets_pos': snippets_pos,
 545                 'snippets_revision': snippets_rev,
 546                 'themes': doc.get('themes', []),
 547                 'themes_pl': doc.get('themes_pl', [])
 548                 })
 549
 550             self._hits.append(hit)
 551
 552     def __unicode__(self):
 553         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 554             (self.book_id, len(self._hits),
 555              len(self._processed_hits) if self._processed_hits else -1,
 556              self._score, len(self.snippets))
 557
 558     def __str__(self):
 559         return unicode(self).encode('utf-8')
 560
 561     @property
 562     def score(self):
 563         return self._score * self.boost
 564
 565     def merge(self, other):
 566         if self.book_id != other.book_id:
 567             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 568         self._hits += other._hits
 569         if other.score > self.score:
 570             self._score = other._score
 571         return self
 572
 573     def get_book(self):
 574         if self._book is not None:
 575             return self._book
 576         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 577         return self._book
 578
 579     book = property(get_book)
 580
 581     POSITION = 0
 582     FRAGMENT = 1
 583     POSITION_INDEX = 1
 584     POSITION_SPAN = 2
 585     SCORE = 2
 586     OTHER = 3
 587
 588     @property
 589     def hits(self):
 590         if self._processed_hits is not None:
 591             return self._processed_hits
 592
 593         # to sections and fragments
 594         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 595
 596         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 597
 598         # sections not covered by fragments
 599         sect = filter(lambda s: 0 == len(filter(
 600             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 601                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 602
 603         def remove_duplicates(lst, keyfn, compare):
 604             els = {}
 605             for e in lst:
 606                 eif = keyfn(e)
 607                 if eif in els:
 608                     if compare(els[eif], e) >= 1:
 609                         continue
 610                 els[eif] = e
 611             return els.values()
 612
 613         # remove fragments with duplicated fid's and duplicated snippets
 614         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 615         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 616         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 617
 618         # remove duplicate sections
 619         sections = {}
 620
 621         for s in sect:
 622             si = s[self.POSITION][self.POSITION_INDEX]
 623             # skip existing
 624             if si in sections:
 625                 if sections[si]['score'] >= s[self.SCORE]:
 626                     continue
 627
 628             m = {'score': s[self.SCORE],
 629                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 630                  }
 631             m.update(s[self.OTHER])
 632             sections[si] = m
 633
 634         hits = sections.values()
 635
 636         for f in frags:
 637             try:
 638                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 639             except catalogue.models.Fragment.DoesNotExist:
 640                 # stale index
 641                 continue
 642             # Figure out if we were searching for a token matching some word in theme name.
 643             themes = frag.tags.filter(category='theme')
 644             themes_hit = set()
 645             if self.query_terms is not None:
 646                 for i in range(0, len(f[self.OTHER]['themes'])):
 647                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 648                     tms = map(unicode.lower, tms)
 649                     for qt in self.query_terms:
 650                         if qt in tms:
 651                             themes_hit.add(f[self.OTHER]['themes'][i])
 652                             break
 653
 654             def theme_by_name(n):
 655                 th = filter(lambda t: t.name == n, themes)
 656                 if th:
 657                     return th[0]
 658                 else:
 659                     return None
 660             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 661
 662             m = {'score': f[self.SCORE],
 663                  'fragment': frag,
 664                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 665                  'themes': themes,
 666                  'themes_hit': themes_hit
 667                  }
 668             m.update(f[self.OTHER])
 669             hits.append(m)
 670
 671         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 672
 673         self._processed_hits = hits
 674
 675         return hits
 676
 677     @staticmethod
 678     def aggregate(*result_lists):
 679         books = {}
 680         for rl in result_lists:
 681             for r in rl:
 682                 if r.book_id in books:
 683                     books[r.book_id].merge(r)
 684                 else:
 685                     books[r.book_id] = r
 686         return books.values()
 687
 688     def __cmp__(self, other):
 689         c = cmp(self.score, other.score)
 690         if c == 0:
 691             # this is inverted, because earlier date is better
 692             return cmp(other.published_date, self.published_date)
 693         else:
 694             return c
 695
 696     def __len__(self):
 697         return len(self.hits)
 698
 699     def snippet_pos(self, idx=0):
 700         return self.hits[idx]['snippets_pos']
 701
 702     def snippet_revision(self, idx=0):
 703         try:
 704             return self.hits[idx]['snippets_revision']
 705         except (IndexError, KeyError):
 706             return None
 707
 708
 709 class Search(SolrIndex):
 710     """
 711     Search facilities.
 712     """
 713     def __init__(self, default_field="text"):
 714         super(Search, self).__init__(mode='r')
 715
 716     def make_term_query(self, query, field='text', modal=operator.or_):
 717         """
 718         Returns term queries joined by boolean query.
 719         modal - applies to boolean query
 720         fuzzy - should the query by fuzzy.
 721         """
 722         if query is None:
 723             query = ''
 724         q = self.index.Q()
 725         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 726
 727         return q
 728
 729     def search_phrase(self, searched, field='text', book=False,
 730                       filters=None,
 731                       snippets=False):
 732         if filters is None:
 733             filters = []
 734         if book:
 735             filters.append(self.index.Q(is_book=True))
 736
 737         q = self.index.query(**{field: searched})
 738         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 739         res = q.paginate(rows=100).execute()
 740         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 741
 742     def search_some(self, searched, fields, book=True,
 743                     filters=None, snippets=True, query_terms=None):
 744         assert isinstance(fields, list)
 745         if filters is None:
 746             filters = []
 747         if book:
 748             filters.append(self.index.Q(is_book=True))
 749
 750         query = self.index.Q()
 751
 752         for fld in fields:
 753             query = self.index.Q(query | self.make_term_query(searched, fld))
 754
 755         query = self.index.query(query)
 756         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 757         res = query.execute()
 758         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 759
 760     def search_everywhere(self, searched, query_terms=None):
 761         """
 762         Tries to use search terms to match different fields of book (or its parts).
 763         E.g. one word can be an author survey, another be a part of the title, and the rest
 764         are some words from third chapter.
 765         """
 766         books = []
 767         # content only query : themes x content
 768         q = self.make_term_query(searched, 'text')
 769         q_themes = self.make_term_query(searched, 'themes_pl')
 770
 771         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 772         res = query.execute()
 773
 774         for found in res:
 775             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 776
 777         # query themes/content x author/title/tags
 778         in_content = self.index.Q()
 779         in_meta = self.index.Q()
 780
 781         for fld in ['themes_pl', 'text']:
 782             in_content |= self.make_term_query(searched, field=fld)
 783
 784         for fld in ['tags', 'authors', 'title']:
 785             in_meta |= self.make_term_query(searched, field=fld)
 786
 787         q = in_content & in_meta
 788         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 789
 790         for found in res:
 791             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 792
 793         return books
 794
 795     def get_snippets(self, searchresult, query, field='text', num=1):
 796         """
 797         Returns a snippet for found scoreDoc.
 798         """
 799         maxnum = len(searchresult)
 800         if num is None or num < 0 or num > maxnum:
 801             num = maxnum
 802         book_id = searchresult.book_id
 803         revision = searchresult.snippet_revision()
 804         snippets = Snippets(book_id, revision=revision)
 805         snips = [None] * maxnum
 806         try:
 807             snippets.open()
 808             idx = 0
 809             while idx < maxnum and num > 0:
 810                 position, length = searchresult.snippet_pos(idx)
 811                 if position is None or length is None:
 812                     continue
 813                 text = snippets.get((int(position),
 814                                      int(length)))
 815                 snip = self.index.highlight(text=text, field=field, q=query)
 816                 if snip not in snips:
 817                     snips[idx] = snip
 818                     if snip:
 819                         num -= 1
 820                 idx += 1
 821
 822         except IOError, e:
 823             book = catalogue.models.Book.objects.filter(id=book_id)
 824             if not book:
 825                 log.error("Book does not exist for book id = %d" % book_id)
 826             elif not book.get().children.exists():
 827                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 828             return []
 829         finally:
 830             snippets.close()
 831
 832             # remove verse end markers..
 833         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 834
 835         searchresult.snippets = snips
 836
 837         return snips
 838
 839     def hint_tags(self, query, pdcounter=True, prefix=True):
 840         """
 841         Return auto-complete hints for tags
 842         using prefix search.
 843         """
 844         q = self.index.Q()
 845         query = query.strip()
 846         for field in ['tag_name', 'tag_name_pl']:
 847             if prefix:
 848                 q |= self.index.Q(**{field: query + "*"})
 849             else:
 850                 q |= self.make_term_query(query, field=field)
 851         qu = self.index.query(q)
 852
 853         return self.search_tags(qu, pdcounter=pdcounter)
 854
 855     def search_tags(self, query, filters=None, pdcounter=False):
 856         """
 857         Search for Tag objects using query.
 858         """
 859         if not filters:
 860             filters = []
 861         if not pdcounter:
 862             filters.append(~self.index.Q(is_pdcounter=True))
 863         res = self.apply_filters(query, filters).execute()
 864
 865         tags = []
 866         pd_tags = []
 867
 868         for doc in res:
 869             is_pdcounter = doc.get('is_pdcounter', False)
 870             category = doc.get('tag_category')
 871             try:
 872                 if is_pdcounter:
 873                     if category == 'pd_author':
 874                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 875                     elif category == 'pd_book':
 876                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 877                         tag.category = 'pd_book'  # make it look more lik a tag.
 878                     else:
 879                         # WTF
 880                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
 881                             int(doc.get('tag_id')), category)).encode('utf-8')
 882                     pd_tags.append(tag)
 883                 else:
 884                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 885                     tags.append(tag)
 886
 887             except catalogue.models.Tag.DoesNotExist:
 888                 pass
 889             except PDCounterAuthor.DoesNotExist:
 890                 pass
 891             except PDCounterBook.DoesNotExist:
 892                 pass
 893
 894         tags_slugs = set(map(lambda t: t.slug, tags))
 895         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
 896
 897         log.debug('search_tags: %s' % tags)
 898
 899         return tags
 900
 901     def hint_books(self, query, prefix=True):
 902         """
 903         Returns auto-complete hints for book titles
 904         Because we do not index 'pseudo' title-tags.
 905         Prefix search.
 906         """
 907         q = self.index.Q()
 908         query = query.strip()
 909         if prefix:
 910             q |= self.index.Q(title=query + "*")
 911             q |= self.index.Q(title_orig=query + "*")
 912         else:
 913             q |= self.make_term_query(query, field='title')
 914             q |= self.make_term_query(query, field='title_orig')
 915         qu = self.index.query(q)
 916         only_books = self.index.Q(is_book=True)
 917         return self.search_books(qu, [only_books])
 918
 919     def search_books(self, query, filters=None, max_results=10):
 920         """
 921         Searches for Book objects using query
 922         """
 923         bks = []
 924         bks_found = set()
 925         query = query.query(is_book=True)
 926         res = self.apply_filters(query, filters).field_limit(['book_id'])
 927         for r in res:
 928             try:
 929                 bid = r['book_id']
 930                 if bid not in bks_found:
 931                     bks.append(catalogue.models.Book.objects.get(id=bid))
 932                     bks_found.add(bid)
 933             except catalogue.models.Book.DoesNotExist:
 934                 pass
 935         return bks
 936
 937     @staticmethod
 938     def apply_filters(query, filters):
 939         """
 940         Apply filters to a query
 941         """
 942         if filters is None:
 943             filters = []
 944         filters = filter(lambda x: x is not None, filters)
 945         for f in filters:
 946             query = query.query(f)
 947         return query
 948
 949
 950 if getattr(settings, 'SEARCH_MOCK', False):
 951     from .mock_search import Search