src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 import errno
  10 from librarian import dcparser
  11 from librarian.parser import WLDocument
  12 from lxml import etree
  13 import catalogue.models
  14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  15 from itertools import chain
  16 import traceback
  17 import logging
  18 log = logging.getLogger('search')
  19 import sunburnt
  20 import custom
  21 import operator
  22
  23 log = logging.getLogger('search')
  24
  25 class SolrIndex(object):
  26     def __init__(self, mode=None):
  27         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  28
  29
  30 class Snippets(object):
  31     """
  32     This class manages snippet files for indexed object (book)
  33     the snippets are concatenated together, and their positions and
  34     lengths are kept in lucene index fields.
  35     """
  36     SNIPPET_DIR = "snippets"
  37
  38     def __init__(self, book_id, revision=None):
  39         try:
  40             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  41         except OSError as exc:
  42             if exc.errno == errno.EEXIST:
  43                 pass
  44             else: raise
  45         self.book_id = book_id
  46         self.revision = revision
  47         self.file = None
  48
  49     @property
  50     def path(self):
  51         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
  52         else: fn = "%d" % self.book_id
  53
  54         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  55
  56     def open(self, mode='r'):
  57         """
  58         Open the snippet file. Call .close() afterwards.
  59         """
  60         if not 'b' in mode:
  61             mode += 'b'
  62
  63         if 'w' in mode:
  64             if os.path.exists(self.path):
  65                 self.revision = 1
  66                 while True:
  67                     if not os.path.exists(self.path):
  68                         break
  69                     self.revision += 1
  70
  71         self.file = open(self.path, mode)
  72         self.position = 0
  73         return self
  74
  75     def add(self, snippet):
  76         """
  77         Append a snippet (unicode) to the snippet file.
  78         Return a (position, length) tuple
  79         """
  80         txt = snippet.encode('utf-8')
  81         l = len(txt)
  82         self.file.write(txt)
  83         pos = (self.position, l)
  84         self.position += l
  85         return pos
  86
  87     def get(self, pos):
  88         """
  89         Given a tuple of (position, length) return an unicode
  90         of the snippet stored there.
  91         """
  92         self.file.seek(pos[0], 0)
  93         txt = self.file.read(pos[1]).decode('utf-8')
  94         return txt
  95
  96     def close(self):
  97         """Close snippet file"""
  98         if self.file:
  99             self.file.close()
 100
 101     def remove(self):
 102         self.revision = None
 103         try:
 104             os.unlink(self.path)
 105             self.revision = 0
 106             while True:
 107                 self.revision += 1
 108                 os.unlink(self.path)
 109         except OSError:
 110             pass
 111
 112
 113 class Index(SolrIndex):
 114     """
 115     Class indexing books.
 116     """
 117     def __init__(self):
 118         super(Index, self).__init__(mode='rw')
 119
 120     def delete_query(self, *queries):
 121         """
 122         index.delete(queries=...) doesn't work, so let's reimplement it
 123         using deletion of list of uids.
 124         """
 125         uids = set()
 126         for q in queries:
 127             if isinstance(q, sunburnt.search.LuceneQuery):
 128                 q = self.index.query(q)
 129             q.field_limiter.update(['uid'])
 130             st = 0
 131             rows = 100
 132             while True:
 133                 ids = q.paginate(start=st, rows=rows).execute()
 134                 if not len(ids):
 135                     break
 136                 for res in ids:
 137                     uids.add(res['uid'])
 138                 st += rows
 139         if uids:
 140             self.index.delete(uids)
 141             return True
 142         else:
 143             return False
 144
 145     def index_tags(self, *tags, **kw):
 146         """
 147         Re-index global tag list.
 148         Removes all tags from index, then index them again.
 149         Indexed fields include: id, name (with and without polish stems), category
 150         """
 151         log.debug("Indexing tags")
 152         remove_only = kw.get('remove_only', False)
 153         # first, remove tags from index.
 154         if tags:
 155             tag_qs = []
 156             for tag in tags:
 157                 q_id = self.index.Q(tag_id=tag.id)
 158
 159                 if isinstance(tag, PDCounterAuthor):
 160                     q_cat = self.index.Q(tag_category='pd_author')
 161                 elif isinstance(tag, PDCounterBook):
 162                     q_cat = self.index.Q(tag_category='pd_book')
 163                 else:
 164                     q_cat = self.index.Q(tag_category=tag.category)
 165
 166                 q_id_cat = self.index.Q(q_id & q_cat)
 167                 tag_qs.append(q_id_cat)
 168             self.delete_query(*tag_qs)
 169         else:  # all
 170             q = self.index.Q(tag_id__any=True)
 171             self.delete_query(q)
 172
 173         if not remove_only:
 174             # then add them [all or just one passed]
 175             if not tags:
 176                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 177                     PDCounterAuthor.objects.all(), \
 178                     PDCounterBook.objects.all())
 179
 180             for tag in tags:
 181                 if isinstance(tag, PDCounterAuthor):
 182                     doc = {
 183                         "tag_id": int(tag.id),
 184                         "tag_name": tag.name,
 185                         "tag_name_pl": tag.name,
 186                         "tag_category": 'pd_author',
 187                         "is_pdcounter": True,
 188                         "uid": "tag%d_pd_a" % tag.id
 189                         }
 190                 elif isinstance(tag, PDCounterBook):
 191                     doc = {
 192                         "tag_id": int(tag.id),
 193                         "tag_name": tag.title,
 194                         "tag_name_pl": tag.title,
 195                         "tag_category": 'pd_book',
 196                         "is_pdcounter": True,
 197                         "uid": "tag%d_pd_b" % tag.id
 198                         }
 199                 else:
 200                     doc = {
 201                         "tag_id": int(tag.id),
 202                         "tag_name": tag.name,
 203                         "tag_name_pl": tag.name,
 204                         "tag_category": tag.category,
 205                         "is_pdcounter": False,
 206                         "uid": "tag%d" % tag.id
 207                         }
 208                 self.index.add(doc)
 209
 210     def create_book_doc(self, book):
 211         """
 212         Create a lucene document referring book id.
 213         """
 214         doc = {
 215             'book_id': int(book.id),
 216             }
 217         if book.parent is not None:
 218             doc["parent_id"] = int(book.parent.id)
 219         return doc
 220
 221     def remove_book(self, book_or_id, remove_snippets=True):
 222         """Removes a book from search index.
 223         book - Book instance."""
 224         if isinstance(book_or_id, catalogue.models.Book):
 225             book_id = book_or_id.id
 226         else:
 227             book_id = book_or_id
 228
 229         self.delete_query(self.index.Q(book_id=book_id))
 230
 231         if remove_snippets:
 232             snippets = Snippets(book_id)
 233             snippets.remove()
 234
 235     def index_book(self, book, book_info=None, overwrite=True):
 236         """
 237         Indexes the book.
 238         Creates a lucene document for extracted metadata
 239         and calls self.index_content() to index the contents of the book.
 240         """
 241         if overwrite:
 242             # we don't remove snippets, since they might be still needed by
 243             # threads using not reopened index
 244             self.remove_book(book, remove_snippets=False)
 245
 246         book_doc = self.create_book_doc(book)
 247         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 248         # let's not index it - it's only used for extracting publish date
 249         if 'source_name' in meta_fields:
 250             del meta_fields['source_name']
 251
 252         for n, f in meta_fields.items():
 253             book_doc[n] = f
 254
 255         book_doc['uid'] = "book%s" % book_doc['book_id']
 256         self.index.add(book_doc)
 257         del book_doc
 258         book_fields = {
 259             'title': meta_fields['title'],
 260             'authors': meta_fields['authors'],
 261             'published_date': meta_fields['published_date']
 262             }
 263
 264         if 'translators' in meta_fields:
 265             book_fields['translators'] = meta_fields['translators']
 266
 267         self.index_content(book, book_fields=book_fields)
 268
 269     master_tags = [
 270         'opowiadanie',
 271         'powiesc',
 272         'dramat_wierszowany_l',
 273         'dramat_wierszowany_lp',
 274         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 275         'wywiad',
 276         ]
 277
 278     ignore_content_tags = [
 279         'uwaga', 'extra',
 280         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 281         'didaskalia',
 282         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 283         ]
 284
 285     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 286
 287     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 288
 289     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 290
 291     def extract_metadata(self, book, book_info=None, dc_only=None):
 292         """
 293         Extract metadata from book and returns a map of fields keyed by fieldname
 294         """
 295         fields = {}
 296
 297         if book_info is None:
 298             book_info = dcparser.parse(open(book.xml_file.path))
 299
 300         fields['slug'] = book.slug
 301         fields['tags'] = [t.name  for t in book.tags]
 302         fields['is_book'] = True
 303
 304         # validator, name
 305         for field in dcparser.BookInfo.FIELDS:
 306             if dc_only and field.name not in dc_only:
 307                 continue
 308             if hasattr(book_info, field.name):
 309                 if not getattr(book_info, field.name):
 310                     continue
 311                 # since no type information is available, we use validator
 312                 type_indicator = field.validator
 313                 if type_indicator == dcparser.as_unicode:
 314                     s = getattr(book_info, field.name)
 315                     if field.multiple:
 316                         s = ', '.join(s)
 317                     fields[field.name] = s
 318                 elif type_indicator == dcparser.as_person:
 319                     p = getattr(book_info, field.name)
 320                     if isinstance(p, dcparser.Person):
 321                         persons = unicode(p)
 322                     else:
 323                         persons = ', '.join(map(unicode, p))
 324                     fields[field.name] = persons
 325                 elif type_indicator == dcparser.as_date:
 326                     dt = getattr(book_info, field.name)
 327                     fields[field.name] = dt
 328
 329         # get published date
 330         pd = None
 331         if hasattr(book_info, 'source_name') and book_info.source_name:
 332             match = self.published_date_re.search(book_info.source_name)
 333             if match is not None:
 334                 pd = str(match.groups()[0])
 335         if not pd: pd = ""
 336         fields["published_date"] = pd
 337
 338         return fields
 339
 340     # def add_gaps(self, fields, fieldname):
 341     #     """
 342     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 343     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 344     #     """
 345     #     def gap():
 346     #         while True:
 347     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 348     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 349
 350     def get_master(self, root):
 351         """
 352         Returns the first master tag from an etree.
 353         """
 354         for master in root.iter():
 355             if master.tag in self.master_tags:
 356                 return master
 357
 358     def index_content(self, book, book_fields={}):
 359         """
 360         Walks the book XML and extract content from it.
 361         Adds parts for each header tag and for each fragment.
 362         """
 363         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 364         root = wld.edoc.getroot()
 365
 366         master = self.get_master(root)
 367         if master is None:
 368             return []
 369
 370         def walker(node, ignore_tags=[]):
 371
 372             if node.tag not in ignore_tags:
 373                 yield node, None, None
 374                 if node.text is not None:
 375                     yield None, node.text, None
 376                 for child in list(node):
 377                     for b, t, e in walker(child):
 378                         yield b, t, e
 379                 yield None, None, node
 380
 381             if node.tail is not None:
 382                 yield None, node.tail, None
 383             return
 384
 385         def fix_format(text):
 386             #            separator = [u" ", u"\t", u".", u";", u","]
 387             if isinstance(text, list):
 388                 # need to join it first
 389                 text = filter(lambda s: s is not None, content)
 390                 text = u' '.join(text)
 391                 # for i in range(len(text)):
 392                 #     if i > 0:
 393                 #         if text[i][0] not in separator\
 394                 #             and text[i - 1][-1] not in separator:
 395                 #          text.insert(i, u" ")
 396
 397             return re.sub("(?m)/$", "", text)
 398
 399         def add_part(snippets, **fields):
 400             doc = self.create_book_doc(book)
 401             for n, v in book_fields.items():
 402                 doc[n] = v
 403
 404             doc['header_index'] = fields["header_index"]
 405             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 406             doc['header_type'] = fields['header_type']
 407
 408             doc['text'] = fields['text']
 409
 410             # snippets
 411             snip_pos = snippets.add(fields["text"])
 412
 413             doc['snippets_position'] = snip_pos[0]
 414             doc['snippets_length'] = snip_pos[1]
 415             if snippets.revision:
 416                 doc["snippets_revision"] = snippets.revision
 417
 418             if 'fragment_anchor' in fields:
 419                 doc["fragment_anchor"] = fields['fragment_anchor']
 420
 421             if 'themes' in fields:
 422                 doc['themes'] = fields['themes']
 423             doc['uid'] = "part%s%s%s" % (doc['header_index'],
 424                                          doc['header_span'],
 425                                          doc.get('fragment_anchor', ''))
 426             return doc
 427
 428         def give_me_utf8(s):
 429             if isinstance(s, unicode):
 430                 return s.encode('utf-8')
 431             else:
 432                 return s
 433
 434         fragments = {}
 435         snippets = Snippets(book.id).open('w')
 436         try:
 437             for header, position in zip(list(master), range(len(master))):
 438
 439                 if header.tag in self.skip_header_tags:
 440                     continue
 441                 if header.tag is etree.Comment:
 442                     continue
 443
 444                 # section content
 445                 content = []
 446                 footnote = []
 447
 448                 def all_content(text):
 449                     for frag in fragments.values():
 450                         frag['text'].append(text)
 451                     content.append(text)
 452                 handle_text = [all_content]
 453
 454                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 455                     # handle footnotes
 456                     if start is not None and start.tag in self.footnote_tags:
 457                         footnote = []
 458
 459                         def collect_footnote(t):
 460                             footnote.append(t)
 461
 462                         handle_text.append(collect_footnote)
 463                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 464                         handle_text.pop()
 465                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 466                                        text=u''.join(footnote),
 467                                        is_footnote=True)
 468                         self.index.add(doc)
 469                         footnote = []
 470
 471                     # handle fragments and themes.
 472                     if start is not None and start.tag == 'begin':
 473                         fid = start.attrib['id'][1:]
 474                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 475
 476                     # themes for this fragment
 477                     elif start is not None and start.tag == 'motyw':
 478                         fid = start.attrib['id'][1:]
 479                         handle_text.append(None)
 480                         if start.text is not None:
 481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 482                     elif end is not None and end.tag == 'motyw':
 483                         handle_text.pop()
 484
 485                     elif start is not None and start.tag == 'end':
 486                         fid = start.attrib['id'][1:]
 487                         if fid not in fragments:
 488                             continue  # a broken <end> node, skip it
 489                         frag = fragments[fid]
 490                         if frag['themes'] == []:
 491                             continue  # empty themes list.
 492                         del fragments[fid]
 493
 494                         doc = add_part(snippets,
 495                                        header_type=frag['start_header'],
 496                                        header_index=frag['start_section'],
 497                                        header_span=position - frag['start_section'] + 1,
 498                                        fragment_anchor=fid,
 499                                        text=fix_format(frag['text']),
 500                                        themes=frag['themes'])
 501                         self.index.add(doc)
 502
 503                         # Collect content.
 504
 505                     if text is not None and handle_text is not []:
 506                         hdl = handle_text[-1]
 507                         if hdl is not None:
 508                             hdl(text)
 509
 510                         # in the end, add a section text.
 511                 doc = add_part(snippets, header_index=position,
 512                                header_type=header.tag, text=fix_format(content))
 513
 514                 self.index.add(doc)
 515
 516         finally:
 517             snippets.close()
 518
 519
 520 class SearchResult(object):
 521     def __init__(self, doc, how_found=None, query=None, query_terms=None):
 522         #        self.search = search
 523         self.boost = 1.0
 524         self._hits = []
 525         self._processed_hits = None  # processed hits
 526         self.snippets = []
 527         self.query_terms = query_terms
 528
 529         if 'score' in doc:
 530             self._score = doc['score']
 531         else:
 532             self._score = 0
 533
 534         self.book_id = int(doc["book_id"])
 535
 536         try:
 537             self.published_date = int(doc.get("published_date"))
 538         except ValueError:
 539             self.published_date = 0
 540
 541         # content hits
 542         header_type = doc.get("header_type", None)
 543         # we have a content hit in some header of fragment
 544         if header_type is not None:
 545             sec = (header_type, int(doc["header_index"]))
 546             header_span = doc['header_span']
 547             header_span = header_span is not None and int(header_span) or 1
 548             fragment = doc.get("fragment_anchor", None)
 549             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 550             snippets_rev = doc.get('snippets_revision', None)
 551
 552             hit = (sec + (header_span,), fragment, self._score, {
 553                 'how_found': how_found,
 554                 'snippets_pos': snippets_pos,
 555                 'snippets_revision': snippets_rev,
 556                 'themes': doc.get('themes', []),
 557                 'themes_pl': doc.get('themes_pl', [])
 558                 })
 559
 560             self._hits.append(hit)
 561
 562     def __unicode__(self):
 563         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 564             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 565
 566     def __str__(self):
 567         return unicode(self).encode('utf-8')
 568
 569     @property
 570     def score(self):
 571         return self._score * self.boost
 572
 573     def merge(self, other):
 574         if self.book_id != other.book_id:
 575             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 576         self._hits += other._hits
 577         if other.score > self.score:
 578             self._score = other._score
 579         return self
 580
 581     def get_book(self):
 582         if hasattr(self, '_book'):
 583             return self._book
 584         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 585         return self._book
 586
 587     book = property(get_book)
 588
 589     POSITION = 0
 590     FRAGMENT = 1
 591     POSITION_INDEX = 1
 592     POSITION_SPAN = 2
 593     SCORE = 2
 594     OTHER = 3
 595
 596     @property
 597     def hits(self):
 598         if self._processed_hits is not None:
 599             return self._processed_hits
 600
 601         # to sections and fragments
 602         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 603
 604         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 605
 606         # sections not covered by fragments
 607         sect = filter(lambda s: 0 == len(filter(
 608             lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
 609             and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
 610             frags)), sect)
 611
 612         hits = []
 613
 614         def remove_duplicates(lst, keyfn, compare):
 615             els = {}
 616             for e in lst:
 617                 eif = keyfn(e)
 618                 if eif in els:
 619                     if compare(els[eif], e) >= 1:
 620                         continue
 621                 els[eif] = e
 622             return els.values()
 623
 624         # remove fragments with duplicated fid's and duplicated snippets
 625         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 626         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 627         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 628
 629         # remove duplicate sections
 630         sections = {}
 631
 632         for s in sect:
 633             si = s[self.POSITION][self.POSITION_INDEX]
 634             # skip existing
 635             if si in sections:
 636                 if sections[si]['score'] >= s[self.SCORE]:
 637                     continue
 638
 639             m = {'score': s[self.SCORE],
 640                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 641                  }
 642             m.update(s[self.OTHER])
 643             sections[si] = m
 644
 645         hits = sections.values()
 646
 647         for f in frags:
 648             try:
 649                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 650             except catalogue.models.Fragment.DoesNotExist:
 651                 # stale index
 652                 continue
 653             # Figure out if we were searching for a token matching some word in theme name.
 654             themes = frag.tags.filter(category='theme')
 655             themes_hit = set()
 656             if self.query_terms is not None:
 657                 for i in range(0, len(f[self.OTHER]['themes'])):
 658                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 659                     tms = map(unicode.lower, tms)
 660                     for qt in self.query_terms:
 661                         if qt in tms:
 662                             themes_hit.add(f[self.OTHER]['themes'][i])
 663                             break
 664
 665             def theme_by_name(n):
 666                 th = filter(lambda t: t.name == n, themes)
 667                 if th:
 668                     return th[0]
 669                 else:
 670                     return None
 671             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 672
 673             m = {'score': f[self.SCORE],
 674                  'fragment': frag,
 675                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 676                  'themes': themes,
 677                  'themes_hit': themes_hit
 678                  }
 679             m.update(f[self.OTHER])
 680             hits.append(m)
 681
 682         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 683
 684         self._processed_hits = hits
 685
 686         return hits
 687
 688     @staticmethod
 689     def aggregate(*result_lists):
 690         books = {}
 691         for rl in result_lists:
 692             for r in rl:
 693                 if r.book_id in books:
 694                     books[r.book_id].merge(r)
 695                 else:
 696                     books[r.book_id] = r
 697         return books.values()
 698
 699     def __cmp__(self, other):
 700         c = cmp(self.score, other.score)
 701         if c == 0:
 702             # this is inverted, because earlier date is better
 703             return cmp(other.published_date, self.published_date)
 704         else:
 705             return c
 706
 707     def __len__(self):
 708         return len(self.hits)
 709
 710     def snippet_pos(self, idx=0):
 711         return self.hits[idx]['snippets_pos']
 712
 713     def snippet_revision(self, idx=0):
 714         try:
 715             return self.hits[idx]['snippets_revision']
 716         except:
 717             return None
 718
 719
 720 class Search(SolrIndex):
 721     """
 722     Search facilities.
 723     """
 724     def __init__(self, default_field="text"):
 725         super(Search, self).__init__(mode='r')
 726
 727
 728     def make_term_query(self, query, field='text', modal=operator.or_):
 729         """
 730         Returns term queries joined by boolean query.
 731         modal - applies to boolean query
 732         fuzzy - should the query by fuzzy.
 733         """
 734         if query is None: query = ''
 735         q = self.index.Q()
 736         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
 737                         query.split(r" ")), q)
 738
 739         return q
 740
 741     def search_phrase(self, searched, field='text', book=False,
 742                       filters=None,
 743                       snippets=False):
 744         if filters is None: filters = []
 745         if book: filters.append(self.index.Q(is_book=True))
 746
 747         q = self.index.query(**{field: searched})
 748         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 749         res = q.execute()
 750         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 751
 752     def search_some(self, searched, fields, book=True,
 753                     filters=None, snippets=True, query_terms=None):
 754         assert isinstance(fields, list)
 755         if filters is None: filters = []
 756         if book: filters.append(self.index.Q(is_book=True))
 757
 758         query = self.index.Q()
 759
 760         for fld in fields:
 761             query = self.index.Q(query | self.make_term_query(searched, fld))
 762
 763         query = self.index.query(query)
 764         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 765         res = query.execute()
 766         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 767
 768
 769     def search_everywhere(self, searched, query_terms=None):
 770         """
 771         Tries to use search terms to match different fields of book (or its parts).
 772         E.g. one word can be an author survey, another be a part of the title, and the rest
 773         are some words from third chapter.
 774         """
 775         books = []
 776         # content only query : themes x content
 777         q = self.make_term_query(searched, 'text')
 778         q_themes = self.make_term_query(searched, 'themes_pl')
 779
 780         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 781         res = query.execute()
 782
 783         for found in res:
 784             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 785
 786         # query themes/content x author/title/tags
 787         in_content = self.index.Q()
 788         in_meta = self.index.Q()
 789
 790         for fld in ['themes_pl', 'text']:
 791             in_content |= self.make_term_query(searched, field=fld)
 792
 793         for fld in ['tags', 'authors', 'title']:
 794             in_meta |= self.make_term_query(searched, field=fld)
 795
 796         q = in_content & in_meta
 797         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 798
 799         for found in res:
 800             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 801
 802         return books
 803
 804     def get_snippets(self, searchresult, query, field='text', num=1):
 805         """
 806         Returns a snippet for found scoreDoc.
 807         """
 808         maxnum = len(searchresult)
 809         if num is None or num < 0 or num > maxnum:
 810             num = maxnum
 811         book_id = searchresult.book_id
 812         revision = searchresult.snippet_revision()
 813         snippets = Snippets(book_id, revision=revision)
 814         snips = [None] * maxnum
 815         try:
 816             snippets.open()
 817             idx = 0
 818             while idx < maxnum and num > 0:
 819                 position, length = searchresult.snippet_pos(idx)
 820                 if position is None or length is None:
 821                     continue
 822                 text = snippets.get((int(position),
 823                                      int(length)))
 824                 snip = self.index.highlight(text=text, field=field, q=query)
 825                 snips[idx] = snip
 826                 if snip:
 827                     num -= 1
 828                 idx += 1
 829
 830         except IOError, e:
 831             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 832             return []
 833         finally:
 834             snippets.close()
 835
 836             # remove verse end markers..
 837         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 838
 839         searchresult.snippets = snips
 840
 841         return snips
 842
 843     def hint_tags(self, query, pdcounter=True, prefix=True):
 844         """
 845         Return auto-complete hints for tags
 846         using prefix search.
 847         """
 848         q = self.index.Q()
 849         query = query.strip()
 850         for field in ['tag_name', 'tag_name_pl']:
 851             if prefix:
 852                 q |= self.index.Q(**{field: query + "*"})
 853             else:
 854                 q |= self.make_term_query(query, field=field)
 855         qu = self.index.query(q)
 856
 857         return self.search_tags(qu, pdcounter=pdcounter)
 858
 859     def search_tags(self, query, filters=None, pdcounter=False):
 860         """
 861         Search for Tag objects using query.
 862         """
 863         if not filters: filters = []
 864         if not pdcounter:
 865             filters.append(~self.index.Q(is_pdcounter=True))
 866         res = self.apply_filters(query, filters).execute()
 867
 868         tags = []
 869         pd_tags = []
 870
 871         for doc in res:
 872             is_pdcounter = doc.get('is_pdcounter', False)
 873             category = doc.get('tag_category')
 874             try:
 875                 if is_pdcounter == True:
 876                     if category == 'pd_author':
 877                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 878                     elif category == 'pd_book':
 879                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 880                         tag.category = 'pd_book'  # make it look more lik a tag.
 881                     else:
 882                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
 883                     pd_tags.append(tag)
 884                 else:
 885                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 886                     tags.append(tag)
 887
 888             except catalogue.models.Tag.DoesNotExist: pass
 889             except PDCounterAuthor.DoesNotExist: pass
 890             except PDCounterBook.DoesNotExist: pass
 891
 892         tags_slugs = set(map(lambda t: t.slug, tags))
 893         tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
 894
 895         log.debug('search_tags: %s' % tags)
 896
 897         return tags
 898
 899     def hint_books(self, query, prefix=True):
 900         """
 901         Returns auto-complete hints for book titles
 902         Because we do not index 'pseudo' title-tags.
 903         Prefix search.
 904         """
 905         q = self.index.Q()
 906         query = query.strip()
 907         if prefix:
 908             q |= self.index.Q(title=query + "*")
 909         else:
 910             q |= self.make_term_query(query, field='title')
 911         qu = self.index.query(q)
 912         only_books = self.index.Q(is_book=True)
 913         return self.search_books(qu, [only_books])
 914
 915     def search_books(self, query, filters=None, max_results=10):
 916         """
 917         Searches for Book objects using query
 918         """
 919         bks = []
 920         bks_found = set()
 921         query = query.query(is_book=True)
 922         res = self.apply_filters(query, filters).field_limit(['book_id'])
 923         for r in res:
 924             try:
 925                 bid = r['book_id']
 926                 if not bid in bks_found:
 927                     bks.append(catalogue.models.Book.objects.get(id=bid))
 928                     bks_found.add(bid)
 929             except catalogue.models.Book.DoesNotExist: pass
 930         return bks
 931
 932
 933     @staticmethod
 934     def apply_filters(query, filters):
 935         """
 936         Apply filters to a query
 937         """
 938         if filters is None: filters = []
 939         filters = filter(lambda x: x is not None, filters)
 940         for f in filters:
 941             query = query.query(f)
 942         return query
 943
 944
 945 if getattr(settings, 'SEARCH_MOCK', False):
 946     from .mock_search import Search