apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4
   5 import os
   6 import re
   7 import errno
   8 from librarian import dcparser
   9 from librarian.parser import WLDocument
  10 from lxml import etree
  11 import catalogue.models
  12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  13 from itertools import chain
  14 import traceback
  15 import logging
  16 log = logging.getLogger('search')
  17 import sunburnt
  18 import custom
  19 import operator
  20
  21 log = logging.getLogger('search')
  22
  23 class SolrIndex(object):
  24     def __init__(self, mode=None):
  25         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  26
  27
  28 class Snippets(object):
  29     """
  30     This class manages snippet files for indexed object (book)
  31     the snippets are concatenated together, and their positions and
  32     lengths are kept in lucene index fields.
  33     """
  34     SNIPPET_DIR = "snippets"
  35
  36     def __init__(self, book_id, revision=None):
  37         try:
  38             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         except OSError as exc:
  40             if exc.errno == errno.EEXIST:
  41                 pass
  42             else: raise
  43         self.book_id = book_id
  44         self.revision = revision
  45         self.file = None
  46
  47     @property
  48     def path(self):
  49         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
  50         else: fn = "%d" % self.book_id
  51
  52         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  53
  54     def open(self, mode='r'):
  55         """
  56         Open the snippet file. Call .close() afterwards.
  57         """
  58         if not 'b' in mode:
  59             mode += 'b'
  60
  61         if 'w' in mode:
  62             if os.path.exists(self.path):
  63                 self.revision = 1
  64                 while True:
  65                     if not os.path.exists(self.path):
  66                         break
  67                     self.revision += 1
  68
  69         self.file = open(self.path, mode)
  70         self.position = 0
  71         return self
  72
  73     def add(self, snippet):
  74         """
  75         Append a snippet (unicode) to the snippet file.
  76         Return a (position, length) tuple
  77         """
  78         txt = snippet.encode('utf-8')
  79         l = len(txt)
  80         self.file.write(txt)
  81         pos = (self.position, l)
  82         self.position += l
  83         return pos
  84
  85     def get(self, pos):
  86         """
  87         Given a tuple of (position, length) return an unicode
  88         of the snippet stored there.
  89         """
  90         self.file.seek(pos[0], 0)
  91         txt = self.file.read(pos[1]).decode('utf-8')
  92         return txt
  93
  94     def close(self):
  95         """Close snippet file"""
  96         if self.file:
  97             self.file.close()
  98
  99     def remove(self):
 100         self.revision = None
 101         try:
 102             os.unlink(self.path)
 103             self.revision = 0
 104             while True:
 105                 self.revision += 1
 106                 os.unlink(self.path)
 107         except OSError:
 108             pass
 109
 110
 111 class Index(SolrIndex):
 112     """
 113     Class indexing books.
 114     """
 115     def __init__(self):
 116         super(Index, self).__init__(mode='rw')
 117
 118     def delete_query(self, *queries):
 119         """
 120         index.delete(queries=...) doesn't work, so let's reimplement it
 121         using deletion of list of uids.
 122         """
 123         uids = set()
 124         for q in queries:
 125             if isinstance(q, sunburnt.search.LuceneQuery):
 126                 q = self.index.query(q)
 127             q.field_limiter.update(['uid'])
 128             st = 0
 129             rows = 100
 130             while True:
 131                 ids = q.paginate(start=st, rows=rows).execute()
 132                 if not len(ids):
 133                     break
 134                 for res in ids:
 135                     uids.add(res['uid'])
 136                 st += rows
 137                 #        print "Will delete %s" % ','.join([x for x in uids])
 138         if uids:
 139             self.index.delete(uids)
 140             return True
 141         else:
 142             return False
 143
 144     def index_tags(self, *tags, **kw):
 145         """
 146         Re-index global tag list.
 147         Removes all tags from index, then index them again.
 148         Indexed fields include: id, name (with and without polish stems), category
 149         """
 150         log.debug("Indexing tags")
 151         remove_only = kw.get('remove_only', False)
 152         # first, remove tags from index.
 153         if tags:
 154             tag_qs = []
 155             for tag in tags:
 156                 q_id = self.index.Q(tag_id=tag.id)
 157
 158                 if isinstance(tag, PDCounterAuthor):
 159                     q_cat = self.index.Q(tag_category='pd_author')
 160                 elif isinstance(tag, PDCounterBook):
 161                     q_cat = self.index.Q(tag_category='pd_book')
 162                 else:
 163                     q_cat = self.index.Q(tag_category=tag.category)
 164
 165                 q_id_cat = self.index.Q(q_id & q_cat)
 166                 tag_qs.append(q_id_cat)
 167             self.delete_query(tag_qs)
 168         else:  # all
 169             q = self.index.Q(tag_id__any=True)
 170             self.delete_query(q)
 171
 172         if not remove_only:
 173             # then add them [all or just one passed]
 174             if not tags:
 175                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 176                     PDCounterAuthor.objects.all(), \
 177                     PDCounterBook.objects.all())
 178
 179             for tag in tags:
 180                 if isinstance(tag, PDCounterAuthor):
 181                     doc = {
 182                         "tag_id": int(tag.id),
 183                         "tag_name": tag.name,
 184                         "tag_name_pl": tag.name,
 185                         "tag_category": 'pd_author',
 186                         "is_pdcounter": True,
 187                         "uid": "tag%d_pd_a" % tag.id
 188                         }
 189                 elif isinstance(tag, PDCounterBook):
 190                     doc = {
 191                         "tag_id": int(tag.id),
 192                         "tag_name": tag.title,
 193                         "tag_name_pl": tag.title,
 194                         "tag_category": 'pd_book',
 195                         "is_pdcounter": True,
 196                         "uid": "tag%d_pd_b" % tag.id
 197                         }
 198                 else:
 199                     doc = {
 200                         "tag_id": int(tag.id),
 201                         "tag_name": tag.name,
 202                         "tag_name_pl": tag.name,
 203                         "tag_category": tag.category,
 204                         "is_pdcounter": False,
 205                         "uid": "tag%d" % tag.id
 206                         }
 207                 self.index.add(doc)
 208
 209     def create_book_doc(self, book):
 210         """
 211         Create a lucene document referring book id.
 212         """
 213         doc = {
 214             'book_id': int(book.id),
 215             }
 216         if book.parent is not None:
 217             doc["parent_id"] = int(book.parent.id)
 218         return doc
 219
 220     def remove_book(self, book_or_id, remove_snippets=True):
 221         """Removes a book from search index.
 222         book - Book instance."""
 223         if isinstance(book_or_id, catalogue.models.Book):
 224             book_id = book_or_id.id
 225         else:
 226             book_id = book_or_id
 227
 228         self.delete_query(self.index.Q(book_id=book_id))
 229
 230         if remove_snippets:
 231             snippets = Snippets(book_id)
 232             snippets.remove()
 233
 234     def index_book(self, book, book_info=None, overwrite=True):
 235         """
 236         Indexes the book.
 237         Creates a lucene document for extracted metadata
 238         and calls self.index_content() to index the contents of the book.
 239         """
 240         if overwrite:
 241             # we don't remove snippets, since they might be still needed by
 242             # threads using not reopened index
 243             self.remove_book(book, remove_snippets=False)
 244
 245         book_doc = self.create_book_doc(book)
 246         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 247         # let's not index it - it's only used for extracting publish date
 248         if 'source_name' in meta_fields:
 249             del meta_fields['source_name']
 250
 251         for n, f in meta_fields.items():
 252             book_doc[n] = f
 253
 254         book_doc['uid'] = "book%s" % book_doc['book_id']
 255         self.index.add(book_doc)
 256         del book_doc
 257         book_fields = {
 258             'title': meta_fields['title'],
 259             'authors': meta_fields['authors'],
 260             'published_date': meta_fields['published_date']
 261             }
 262
 263         if 'translators' in meta_fields:
 264             book_fields['translators'] = meta_fields['translators']
 265
 266         self.index_content(book, book_fields=book_fields)
 267
 268     master_tags = [
 269         'opowiadanie',
 270         'powiesc',
 271         'dramat_wierszowany_l',
 272         'dramat_wierszowany_lp',
 273         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 274         'wywiad',
 275         ]
 276
 277     ignore_content_tags = [
 278         'uwaga', 'extra',
 279         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 280         'didaskalia',
 281         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 282         ]
 283
 284     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 285
 286     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 287
 288     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 289
 290     def extract_metadata(self, book, book_info=None, dc_only=None):
 291         """
 292         Extract metadata from book and returns a map of fields keyed by fieldname
 293         """
 294         fields = {}
 295
 296         if book_info is None:
 297             book_info = dcparser.parse(open(book.xml_file.path))
 298
 299         fields['slug'] = book.slug
 300         fields['tags'] = [t.name  for t in book.tags]
 301         fields['is_book'] = True
 302
 303         # validator, name
 304         for field in dcparser.BookInfo.FIELDS:
 305             if dc_only and field.name not in dc_only:
 306                 continue
 307             if hasattr(book_info, field.name):
 308                 if not getattr(book_info, field.name):
 309                     continue
 310                 # since no type information is available, we use validator
 311                 type_indicator = field.validator
 312                 if type_indicator == dcparser.as_unicode:
 313                     s = getattr(book_info, field.name)
 314                     if field.multiple:
 315                         s = ', '.join(s)
 316                     fields[field.name] = s
 317                 elif type_indicator == dcparser.as_person:
 318                     p = getattr(book_info, field.name)
 319                     if isinstance(p, dcparser.Person):
 320                         persons = unicode(p)
 321                     else:
 322                         persons = ', '.join(map(unicode, p))
 323                     fields[field.name] = persons
 324                 elif type_indicator == dcparser.as_date:
 325                     dt = getattr(book_info, field.name)
 326                     fields[field.name] = dt
 327
 328         # get published date
 329         pd = None
 330         if hasattr(book_info, 'source_name') and book_info.source_name:
 331             match = self.published_date_re.search(book_info.source_name)
 332             if match is not None:
 333                 pd = str(match.groups()[0])
 334         if not pd: pd = ""
 335         fields["published_date"] = pd
 336
 337         return fields
 338
 339     # def add_gaps(self, fields, fieldname):
 340     #     """
 341     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 342     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 343     #     """
 344     #     def gap():
 345     #         while True:
 346     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 347     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 348
 349     def get_master(self, root):
 350         """
 351         Returns the first master tag from an etree.
 352         """
 353         for master in root.iter():
 354             if master.tag in self.master_tags:
 355                 return master
 356
 357     def index_content(self, book, book_fields={}):
 358         """
 359         Walks the book XML and extract content from it.
 360         Adds parts for each header tag and for each fragment.
 361         """
 362         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 363         root = wld.edoc.getroot()
 364
 365         master = self.get_master(root)
 366         if master is None:
 367             return []
 368
 369         def walker(node, ignore_tags=[]):
 370
 371             if node.tag not in ignore_tags:
 372                 yield node, None, None
 373                 if node.text is not None:
 374                     yield None, node.text, None
 375                 for child in list(node):
 376                     for b, t, e in walker(child):
 377                         yield b, t, e
 378                 yield None, None, node
 379
 380             if node.tail is not None:
 381                 yield None, node.tail, None
 382             return
 383
 384         def fix_format(text):
 385             #            separator = [u" ", u"\t", u".", u";", u","]
 386             if isinstance(text, list):
 387                 # need to join it first
 388                 text = filter(lambda s: s is not None, content)
 389                 text = u' '.join(text)
 390                 # for i in range(len(text)):
 391                 #     if i > 0:
 392                 #         if text[i][0] not in separator\
 393                 #             and text[i - 1][-1] not in separator:
 394                 #          text.insert(i, u" ")
 395
 396             return re.sub("(?m)/$", "", text)
 397
 398         def add_part(snippets, **fields):
 399             doc = self.create_book_doc(book)
 400             for n, v in book_fields.items():
 401                 doc[n] = v
 402
 403             doc['header_index'] = fields["header_index"]
 404             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 405             doc['header_type'] = fields['header_type']
 406
 407             doc['text'] = fields['text']
 408
 409             # snippets
 410             snip_pos = snippets.add(fields["text"])
 411
 412             doc['snippets_position'] = snip_pos[0]
 413             doc['snippets_length'] = snip_pos[1]
 414             if snippets.revision:
 415                 doc["snippets_revision"] = snippets.revision
 416
 417             if 'fragment_anchor' in fields:
 418                 doc["fragment_anchor"] = fields['fragment_anchor']
 419
 420             if 'themes' in fields:
 421                 doc['themes'] = fields['themes']
 422             doc['uid'] = "part%s%s%s" % (doc['header_index'],
 423                                          doc['header_span'],
 424                                          doc.get('fragment_anchor', ''))
 425             return doc
 426
 427         def give_me_utf8(s):
 428             if isinstance(s, unicode):
 429                 return s.encode('utf-8')
 430             else:
 431                 return s
 432
 433         fragments = {}
 434         snippets = Snippets(book.id).open('w')
 435         try:
 436             for header, position in zip(list(master), range(len(master))):
 437
 438                 if header.tag in self.skip_header_tags:
 439                     continue
 440                 if header.tag is etree.Comment:
 441                     continue
 442
 443                 # section content
 444                 content = []
 445                 footnote = []
 446
 447                 def all_content(text):
 448                     for frag in fragments.values():
 449                         frag['text'].append(text)
 450                     content.append(text)
 451                 handle_text = [all_content]
 452
 453                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 454                     # handle footnotes
 455                     if start is not None and start.tag in self.footnote_tags:
 456                         footnote = []
 457
 458                         def collect_footnote(t):
 459                             footnote.append(t)
 460
 461                         handle_text.append(collect_footnote)
 462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 463                         handle_text.pop()
 464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 465                                        text=u''.join(footnote),
 466                                        is_footnote=True)
 467                         self.index.add(doc)
 468                         #print "@ footnote text: %s" % footnote
 469                         footnote = []
 470
 471                     # handle fragments and themes.
 472                     if start is not None and start.tag == 'begin':
 473                         fid = start.attrib['id'][1:]
 474                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 475
 476                     # themes for this fragment
 477                     elif start is not None and start.tag == 'motyw':
 478                         fid = start.attrib['id'][1:]
 479                         handle_text.append(None)
 480                         if start.text is not None:
 481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 482                     elif end is not None and end.tag == 'motyw':
 483                         handle_text.pop()
 484
 485                     elif start is not None and start.tag == 'end':
 486                         fid = start.attrib['id'][1:]
 487                         if fid not in fragments:
 488                             continue  # a broken <end> node, skip it
 489                         frag = fragments[fid]
 490                         if frag['themes'] == []:
 491                             continue  # empty themes list.
 492                         del fragments[fid]
 493
 494                         doc = add_part(snippets,
 495                                        header_type=frag['start_header'],
 496                                        header_index=frag['start_section'],
 497                                        header_span=position - frag['start_section'] + 1,
 498                                        fragment_anchor=fid,
 499                                        text=fix_format(frag['text']),
 500                                        themes=frag['themes'])
 501                         #print '@ FRAG %s' % frag['content']
 502                         self.index.add(doc)
 503
 504                         # Collect content.
 505
 506                     if text is not None and handle_text is not []:
 507                         hdl = handle_text[-1]
 508                         if hdl is not None:
 509                             hdl(text)
 510
 511                         # in the end, add a section text.
 512                 doc = add_part(snippets, header_index=position,
 513                                header_type=header.tag, text=fix_format(content))
 514                 #print '@ CONTENT: %s' % fix_format(content)
 515
 516                 self.index.add(doc)
 517
 518         finally:
 519             snippets.close()
 520
 521
 522 class SearchResult(object):
 523     def __init__(self, doc, how_found=None, query=None, query_terms=None):
 524         #        self.search = search
 525         self.boost = 1.0
 526         self._hits = []
 527         self._processed_hits = None  # processed hits
 528         self.snippets = []
 529         self.query_terms = query_terms
 530
 531         if 'score' in doc:
 532             self._score = doc['score']
 533         else:
 534             self._score = 0
 535
 536         self.book_id = int(doc["book_id"])
 537
 538         try:
 539             self.published_date = int(doc.get("published_date"))
 540         except ValueError:
 541             self.published_date = 0
 542
 543         # content hits
 544         header_type = doc.get("header_type", None)
 545         # we have a content hit in some header of fragment
 546         if header_type is not None:
 547             sec = (header_type, int(doc["header_index"]))
 548             header_span = doc['header_span']
 549             header_span = header_span is not None and int(header_span) or 1
 550             fragment = doc.get("fragment_anchor", None)
 551             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 552             snippets_rev = doc.get('snippets_revision', None)
 553
 554             hit = (sec + (header_span,), fragment, self._score, {
 555                 'how_found': how_found,
 556                 'snippets_pos': snippets_pos,
 557                 'snippets_revision': snippets_rev,
 558                 'themes': doc.get('themes', []),
 559                 'themes_pl': doc.get('themes_pl', [])
 560                 })
 561
 562             self._hits.append(hit)
 563
 564     def __unicode__(self):
 565         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 566             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 567
 568     def __str__(self):
 569         return unicode(self).encode('utf-8')
 570
 571     @property
 572     def score(self):
 573         return self._score * self.boost
 574
 575     def merge(self, other):
 576         if self.book_id != other.book_id:
 577             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 578         self._hits += other._hits
 579         if other.score > self.score:
 580             self._score = other._score
 581         return self
 582
 583     def get_book(self):
 584         if hasattr(self, '_book'):
 585             return self._book
 586         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 587         return self._book
 588
 589     book = property(get_book)
 590
 591     POSITION = 0
 592     FRAGMENT = 1
 593     POSITION_INDEX = 1
 594     POSITION_SPAN = 2
 595     SCORE = 2
 596     OTHER = 3
 597
 598     @property
 599     def hits(self):
 600         if self._processed_hits is not None:
 601             return self._processed_hits
 602
 603         # to sections and fragments
 604         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 605
 606         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 607
 608         # sections not covered by fragments
 609         sect = filter(lambda s: 0 == len(filter(
 610             lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
 611             and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
 612             frags)), sect)
 613
 614         hits = []
 615
 616         def remove_duplicates(lst, keyfn, compare):
 617             els = {}
 618             for e in lst:
 619                 eif = keyfn(e)
 620                 if eif in els:
 621                     if compare(els[eif], e) >= 1:
 622                         continue
 623                 els[eif] = e
 624             return els.values()
 625
 626         # remove fragments with duplicated fid's and duplicated snippets
 627         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 628         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 629         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 630
 631         # remove duplicate sections
 632         sections = {}
 633
 634         for s in sect:
 635             si = s[self.POSITION][self.POSITION_INDEX]
 636             # skip existing
 637             if si in sections:
 638                 if sections[si]['score'] >= s[self.SCORE]:
 639                     continue
 640
 641             m = {'score': s[self.SCORE],
 642                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 643                  }
 644             m.update(s[self.OTHER])
 645             sections[si] = m
 646
 647         hits = sections.values()
 648
 649         for f in frags:
 650             try:
 651                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 652             except catalogue.models.Fragment.DoesNotExist:
 653                 # stale index
 654                 continue
 655             # Figure out if we were searching for a token matching some word in theme name.
 656             themes = frag.tags.filter(category='theme')
 657             themes_hit = set()
 658             if self.query_terms is not None:
 659                 for i in range(0, len(f[self.OTHER]['themes'])):
 660                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 661                     tms = map(unicode.lower, tms)
 662                     for qt in self.query_terms:
 663                         if qt in tms:
 664                             themes_hit.add(f[self.OTHER]['themes'][i])
 665                             break
 666
 667             def theme_by_name(n):
 668                 th = filter(lambda t: t.name == n, themes)
 669                 if th:
 670                     return th[0]
 671                 else:
 672                     return None
 673             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 674
 675             m = {'score': f[self.SCORE],
 676                  'fragment': frag,
 677                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 678                  'themes': themes,
 679                  'themes_hit': themes_hit
 680                  }
 681             m.update(f[self.OTHER])
 682             hits.append(m)
 683
 684         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 685
 686         self._processed_hits = hits
 687
 688         return hits
 689
 690     @staticmethod
 691     def aggregate(*result_lists):
 692         books = {}
 693         for rl in result_lists:
 694             for r in rl:
 695                 if r.book_id in books:
 696                     books[r.book_id].merge(r)
 697                 else:
 698                     books[r.book_id] = r
 699         return books.values()
 700
 701     def __cmp__(self, other):
 702         c = cmp(self.score, other.score)
 703         if c == 0:
 704             # this is inverted, because earlier date is better
 705             return cmp(other.published_date, self.published_date)
 706         else:
 707             return c
 708
 709     def __len__(self):
 710         return len(self.hits)
 711
 712     def snippet_pos(self, idx=0):
 713         return self.hits[idx]['snippets_pos']
 714
 715     def snippet_revision(self, idx=0):
 716         try:
 717             return self.hits[idx]['snippets_revision']
 718         except:
 719             return None
 720
 721
 722 class Search(SolrIndex):
 723     """
 724     Search facilities.
 725     """
 726     def __init__(self, default_field="text"):
 727         super(Search, self).__init__(mode='r')
 728
 729
 730     def make_term_query(self, query, field='text', modal=operator.or_):
 731         """
 732         Returns term queries joined by boolean query.
 733         modal - applies to boolean query
 734         fuzzy - should the query by fuzzy.
 735         """
 736         if query is None: query = ''
 737         q = self.index.Q()
 738         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
 739                         query.split(r" ")), q)
 740
 741         return q
 742
 743     def search_phrase(self, searched, field='text', book=False,
 744                       filters=None,
 745                       snippets=False):
 746         if filters is None: filters = []
 747         if book: filters.append(self.index.Q(is_book=True))
 748
 749         q = self.index.query(**{field: searched})
 750         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 751         res = q.execute()
 752         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 753
 754     def search_some(self, searched, fields, book=True,
 755                     filters=None, snippets=True, query_terms=None):
 756         assert isinstance(fields, list)
 757         if filters is None: filters = []
 758         if book: filters.append(self.index.Q(is_book=True))
 759
 760         query = self.index.Q()
 761
 762         for fld in fields:
 763             query = self.index.Q(query | self.make_term_query(searched, fld))
 764
 765         query = self.index.query(query)
 766         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 767         res = query.execute()
 768         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 769
 770
 771     def search_everywhere(self, searched, query_terms=None):
 772         """
 773         Tries to use search terms to match different fields of book (or its parts).
 774         E.g. one word can be an author survey, another be a part of the title, and the rest
 775         are some words from third chapter.
 776         """
 777         books = []
 778         # content only query : themes x content
 779         q = self.make_term_query(searched, 'text')
 780         q_themes = self.make_term_query(searched, 'themes_pl')
 781
 782         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 783         res = query.execute()
 784
 785         for found in res:
 786             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 787
 788         # query themes/content x author/title/tags
 789         in_content = self.index.Q()
 790         in_meta = self.index.Q()
 791
 792         for fld in ['themes_pl', 'text']:
 793             in_content |= self.make_term_query(searched, field=fld)
 794
 795         for fld in ['tags', 'authors', 'title']:
 796             in_meta |= self.make_term_query(searched, field=fld)
 797
 798         q = in_content & in_meta
 799         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 800
 801         for found in res:
 802             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 803
 804         return books
 805
 806     def get_snippets(self, searchresult, query, field='text', num=1):
 807         """
 808         Returns a snippet for found scoreDoc.
 809         """
 810         maxnum = len(searchresult)
 811         if num is None or num < 0 or num > maxnum:
 812             num = maxnum
 813         book_id = searchresult.book_id
 814         revision = searchresult.snippet_revision()
 815         snippets = Snippets(book_id, revision=revision)
 816         snips = [None] * maxnum
 817         try:
 818             snippets.open()
 819             idx = 0
 820             while idx < maxnum and num > 0:
 821                 position, length = searchresult.snippet_pos(idx)
 822                 if position is None or length is None:
 823                     continue
 824                 text = snippets.get((int(position),
 825                                      int(length)))
 826                 snip = self.index.highlight(text=text, field=field, q=query)
 827                 snips[idx] = snip
 828                 if snip:
 829                     num -= 1
 830                 idx += 1
 831
 832         except IOError, e:
 833             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 834             return []
 835         finally:
 836             snippets.close()
 837
 838             # remove verse end markers..
 839         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 840
 841         searchresult.snippets = snips
 842
 843         return snips
 844
 845     def hint_tags(self, query, pdcounter=True, prefix=True):
 846         """
 847         Return auto-complete hints for tags
 848         using prefix search.
 849         """
 850         q = self.index.Q()
 851         query = query.strip()
 852         for field in ['tag_name', 'tag_name_pl']:
 853             if prefix:
 854                 q |= self.index.Q(**{field: query + "*"})
 855             else:
 856                 q |= self.make_term_query(query, field=field)
 857         qu = self.index.query(q).exclude(tag_category="book")
 858
 859         return self.search_tags(qu, pdcounter=pdcounter)
 860
 861     def search_tags(self, query, filters=None, pdcounter=False):
 862         """
 863         Search for Tag objects using query.
 864         """
 865         if not filters: filters = []
 866         if not pdcounter:
 867             filters.append(~self.index.Q(is_pdcounter=True))
 868         res = self.apply_filters(query, filters).execute()
 869
 870         tags = []
 871         pd_tags = []
 872
 873         for doc in res:
 874             is_pdcounter = doc.get('is_pdcounter', False)
 875             category = doc.get('tag_category')
 876             try:
 877                 if is_pdcounter == True:
 878                     if category == 'pd_author':
 879                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 880                     elif category == 'pd_book':
 881                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 882                         tag.category = 'pd_book'  # make it look more lik a tag.
 883                     else:
 884                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
 885                     pd_tags.append(tag)
 886                 else:
 887                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 888                     tags.append(tag)
 889
 890             except catalogue.models.Tag.DoesNotExist: pass
 891             except PDCounterAuthor.DoesNotExist: pass
 892             except PDCounterBook.DoesNotExist: pass
 893
 894         tags_slugs = set(map(lambda t: t.slug, tags))
 895         tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
 896
 897         log.debug('search_tags: %s' % tags)
 898
 899         return tags
 900
 901     def hint_books(self, query, prefix=True):
 902         """
 903         Returns auto-complete hints for book titles
 904         Because we do not index 'pseudo' title-tags.
 905         Prefix search.
 906         """
 907         q = self.index.Q()
 908         query = query.strip()
 909         if prefix:
 910             q |= self.index.Q(title=query + "*")
 911         else:
 912             q |= self.make_term_query(query, field='title')
 913         qu = self.index.query(q)
 914         only_books = self.index.Q(is_book=True)
 915         return self.search_books(qu, [only_books])
 916
 917     def search_books(self, query, filters=None, max_results=10):
 918         """
 919         Searches for Book objects using query
 920         """
 921         bks = []
 922         bks_found = set()
 923         query = query.query(is_book=True)
 924         res = self.apply_filters(query, filters).field_limit(['book_id'])
 925         for r in res:
 926             try:
 927                 bid = r['book_id']
 928                 if not bid in bks_found:
 929                     bks.append(catalogue.models.Book.objects.get(id=bid))
 930                     bks_found.add(bid)
 931             except catalogue.models.Book.DoesNotExist: pass
 932         return bks
 933
 934
 935     @staticmethod
 936     def apply_filters(query, filters):
 937         """
 938         Apply filters to a query
 939         """
 940         if filters is None: filters = []
 941         filters = filter(lambda x: x is not None, filters)
 942         for f in filters:
 943             query = query.query(f)
 944         return query