apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4
   5 import os
   6 import re
   7 import errno
   8 from librarian import dcparser
   9 from librarian.parser import WLDocument
  10 from lxml import etree
  11 import catalogue.models
  12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  13 from itertools import chain
  14 import traceback
  15 import logging
  16 log = logging.getLogger('search')
  17 import sunburnt
  18 import custom
  19 import operator
  20
  21 log = logging.getLogger('search')
  22
  23 class SolrIndex(object):
  24     def __init__(self, mode=None):
  25         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  26
  27
  28 class Snippets(object):
  29     """
  30     This class manages snippet files for indexed object (book)
  31     the snippets are concatenated together, and their positions and
  32     lengths are kept in lucene index fields.
  33     """
  34     SNIPPET_DIR = "snippets"
  35
  36     def __init__(self, book_id, revision=None):
  37         try:
  38             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  39         except OSError as exc:
  40             if exc.errno == errno.EEXIST:
  41                 pass
  42             else: raise
  43         self.book_id = book_id
  44         self.revision = revision
  45         self.file = None
  46
  47     @property
  48     def path(self):
  49         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
  50         else: fn = "%d" % self.book_id
  51
  52         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  53
  54     def open(self, mode='r'):
  55         """
  56         Open the snippet file. Call .close() afterwards.
  57         """
  58         if not 'b' in mode:
  59             mode += 'b'
  60
  61         if 'w' in mode:
  62             if os.path.exists(self.path):
  63                 self.revision = 1
  64                 while True:
  65                     if not os.path.exists(self.path):
  66                         break
  67                     self.revision += 1
  68
  69         self.file = open(self.path, mode)
  70         self.position = 0
  71         return self
  72
  73     def add(self, snippet):
  74         """
  75         Append a snippet (unicode) to the snippet file.
  76         Return a (position, length) tuple
  77         """
  78         txt = snippet.encode('utf-8')
  79         l = len(txt)
  80         self.file.write(txt)
  81         pos = (self.position, l)
  82         self.position += l
  83         return pos
  84
  85     def get(self, pos):
  86         """
  87         Given a tuple of (position, length) return an unicode
  88         of the snippet stored there.
  89         """
  90         self.file.seek(pos[0], 0)
  91         txt = self.file.read(pos[1]).decode('utf-8')
  92         return txt
  93
  94     def close(self):
  95         """Close snippet file"""
  96         if self.file:
  97             self.file.close()
  98
  99     def remove(self):
 100         self.revision = None
 101         try:
 102             os.unlink(self.path)
 103             self.revision = 0
 104             while True:
 105                 self.revision += 1
 106                 os.unlink(self.path)
 107         except OSError:
 108             pass
 109
 110
 111 class Index(SolrIndex):
 112     """
 113     Class indexing books.
 114     """
 115     def __init__(self):
 116         super(Index, self).__init__(mode='rw')
 117
 118     def delete_query(self, *queries):
 119         """
 120         index.delete(queries=...) doesn't work, so let's reimplement it
 121         using deletion of list of uids.
 122         """
 123         uids = set()
 124         for q in queries:
 125             if isinstance(q, sunburnt.search.LuceneQuery):
 126                 q = self.index.query(q)
 127             q.field_limiter.update(['uid'])
 128             st = 0
 129             rows = 100
 130             while True:
 131                 ids = q.paginate(start=st, rows=rows).execute()
 132                 if not len(ids):
 133                     break
 134                 for res in ids:
 135                     uids.add(res['uid'])
 136                 st += rows
 137         if uids:
 138             self.index.delete(uids)
 139             return True
 140         else:
 141             return False
 142
 143     def index_tags(self, *tags, **kw):
 144         """
 145         Re-index global tag list.
 146         Removes all tags from index, then index them again.
 147         Indexed fields include: id, name (with and without polish stems), category
 148         """
 149         log.debug("Indexing tags")
 150         remove_only = kw.get('remove_only', False)
 151         # first, remove tags from index.
 152         if tags:
 153             tag_qs = []
 154             for tag in tags:
 155                 q_id = self.index.Q(tag_id=tag.id)
 156
 157                 if isinstance(tag, PDCounterAuthor):
 158                     q_cat = self.index.Q(tag_category='pd_author')
 159                 elif isinstance(tag, PDCounterBook):
 160                     q_cat = self.index.Q(tag_category='pd_book')
 161                 else:
 162                     q_cat = self.index.Q(tag_category=tag.category)
 163
 164                 q_id_cat = self.index.Q(q_id & q_cat)
 165                 tag_qs.append(q_id_cat)
 166             self.delete_query(tag_qs)
 167         else:  # all
 168             q = self.index.Q(tag_id__any=True)
 169             self.delete_query(q)
 170
 171         if not remove_only:
 172             # then add them [all or just one passed]
 173             if not tags:
 174                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 175                     PDCounterAuthor.objects.all(), \
 176                     PDCounterBook.objects.all())
 177
 178             for tag in tags:
 179                 if isinstance(tag, PDCounterAuthor):
 180                     doc = {
 181                         "tag_id": int(tag.id),
 182                         "tag_name": tag.name,
 183                         "tag_name_pl": tag.name,
 184                         "tag_category": 'pd_author',
 185                         "is_pdcounter": True,
 186                         "uid": "tag%d_pd_a" % tag.id
 187                         }
 188                 elif isinstance(tag, PDCounterBook):
 189                     doc = {
 190                         "tag_id": int(tag.id),
 191                         "tag_name": tag.title,
 192                         "tag_name_pl": tag.title,
 193                         "tag_category": 'pd_book',
 194                         "is_pdcounter": True,
 195                         "uid": "tag%d_pd_b" % tag.id
 196                         }
 197                 else:
 198                     doc = {
 199                         "tag_id": int(tag.id),
 200                         "tag_name": tag.name,
 201                         "tag_name_pl": tag.name,
 202                         "tag_category": tag.category,
 203                         "is_pdcounter": False,
 204                         "uid": "tag%d" % tag.id
 205                         }
 206                 self.index.add(doc)
 207
 208     def create_book_doc(self, book):
 209         """
 210         Create a lucene document referring book id.
 211         """
 212         doc = {
 213             'book_id': int(book.id),
 214             }
 215         if book.parent is not None:
 216             doc["parent_id"] = int(book.parent.id)
 217         return doc
 218
 219     def remove_book(self, book_or_id, remove_snippets=True):
 220         """Removes a book from search index.
 221         book - Book instance."""
 222         if isinstance(book_or_id, catalogue.models.Book):
 223             book_id = book_or_id.id
 224         else:
 225             book_id = book_or_id
 226
 227         self.delete_query(self.index.Q(book_id=book_id))
 228
 229         if remove_snippets:
 230             snippets = Snippets(book_id)
 231             snippets.remove()
 232
 233     def index_book(self, book, book_info=None, overwrite=True):
 234         """
 235         Indexes the book.
 236         Creates a lucene document for extracted metadata
 237         and calls self.index_content() to index the contents of the book.
 238         """
 239         if overwrite:
 240             # we don't remove snippets, since they might be still needed by
 241             # threads using not reopened index
 242             self.remove_book(book, remove_snippets=False)
 243
 244         book_doc = self.create_book_doc(book)
 245         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
 246         # let's not index it - it's only used for extracting publish date
 247         if 'source_name' in meta_fields:
 248             del meta_fields['source_name']
 249
 250         for n, f in meta_fields.items():
 251             book_doc[n] = f
 252
 253         book_doc['uid'] = "book%s" % book_doc['book_id']
 254         self.index.add(book_doc)
 255         del book_doc
 256         book_fields = {
 257             'title': meta_fields['title'],
 258             'authors': meta_fields['authors'],
 259             'published_date': meta_fields['published_date']
 260             }
 261
 262         if 'translators' in meta_fields:
 263             book_fields['translators'] = meta_fields['translators']
 264
 265         self.index_content(book, book_fields=book_fields)
 266
 267     master_tags = [
 268         'opowiadanie',
 269         'powiesc',
 270         'dramat_wierszowany_l',
 271         'dramat_wierszowany_lp',
 272         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 273         'wywiad',
 274         ]
 275
 276     ignore_content_tags = [
 277         'uwaga', 'extra',
 278         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 279         'didaskalia',
 280         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 281         ]
 282
 283     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 284
 285     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 286
 287     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 288
 289     def extract_metadata(self, book, book_info=None, dc_only=None):
 290         """
 291         Extract metadata from book and returns a map of fields keyed by fieldname
 292         """
 293         fields = {}
 294
 295         if book_info is None:
 296             book_info = dcparser.parse(open(book.xml_file.path))
 297
 298         fields['slug'] = book.slug
 299         fields['tags'] = [t.name  for t in book.tags]
 300         fields['is_book'] = True
 301
 302         # validator, name
 303         for field in dcparser.BookInfo.FIELDS:
 304             if dc_only and field.name not in dc_only:
 305                 continue
 306             if hasattr(book_info, field.name):
 307                 if not getattr(book_info, field.name):
 308                     continue
 309                 # since no type information is available, we use validator
 310                 type_indicator = field.validator
 311                 if type_indicator == dcparser.as_unicode:
 312                     s = getattr(book_info, field.name)
 313                     if field.multiple:
 314                         s = ', '.join(s)
 315                     fields[field.name] = s
 316                 elif type_indicator == dcparser.as_person:
 317                     p = getattr(book_info, field.name)
 318                     if isinstance(p, dcparser.Person):
 319                         persons = unicode(p)
 320                     else:
 321                         persons = ', '.join(map(unicode, p))
 322                     fields[field.name] = persons
 323                 elif type_indicator == dcparser.as_date:
 324                     dt = getattr(book_info, field.name)
 325                     fields[field.name] = dt
 326
 327         # get published date
 328         pd = None
 329         if hasattr(book_info, 'source_name') and book_info.source_name:
 330             match = self.published_date_re.search(book_info.source_name)
 331             if match is not None:
 332                 pd = str(match.groups()[0])
 333         if not pd: pd = ""
 334         fields["published_date"] = pd
 335
 336         return fields
 337
 338     # def add_gaps(self, fields, fieldname):
 339     #     """
 340     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 341     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 342     #     """
 343     #     def gap():
 344     #         while True:
 345     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 346     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 347
 348     def get_master(self, root):
 349         """
 350         Returns the first master tag from an etree.
 351         """
 352         for master in root.iter():
 353             if master.tag in self.master_tags:
 354                 return master
 355
 356     def index_content(self, book, book_fields={}):
 357         """
 358         Walks the book XML and extract content from it.
 359         Adds parts for each header tag and for each fragment.
 360         """
 361         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 362         root = wld.edoc.getroot()
 363
 364         master = self.get_master(root)
 365         if master is None:
 366             return []
 367
 368         def walker(node, ignore_tags=[]):
 369
 370             if node.tag not in ignore_tags:
 371                 yield node, None, None
 372                 if node.text is not None:
 373                     yield None, node.text, None
 374                 for child in list(node):
 375                     for b, t, e in walker(child):
 376                         yield b, t, e
 377                 yield None, None, node
 378
 379             if node.tail is not None:
 380                 yield None, node.tail, None
 381             return
 382
 383         def fix_format(text):
 384             #            separator = [u" ", u"\t", u".", u";", u","]
 385             if isinstance(text, list):
 386                 # need to join it first
 387                 text = filter(lambda s: s is not None, content)
 388                 text = u' '.join(text)
 389                 # for i in range(len(text)):
 390                 #     if i > 0:
 391                 #         if text[i][0] not in separator\
 392                 #             and text[i - 1][-1] not in separator:
 393                 #          text.insert(i, u" ")
 394
 395             return re.sub("(?m)/$", "", text)
 396
 397         def add_part(snippets, **fields):
 398             doc = self.create_book_doc(book)
 399             for n, v in book_fields.items():
 400                 doc[n] = v
 401
 402             doc['header_index'] = fields["header_index"]
 403             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 404             doc['header_type'] = fields['header_type']
 405
 406             doc['text'] = fields['text']
 407
 408             # snippets
 409             snip_pos = snippets.add(fields["text"])
 410
 411             doc['snippets_position'] = snip_pos[0]
 412             doc['snippets_length'] = snip_pos[1]
 413             if snippets.revision:
 414                 doc["snippets_revision"] = snippets.revision
 415
 416             if 'fragment_anchor' in fields:
 417                 doc["fragment_anchor"] = fields['fragment_anchor']
 418
 419             if 'themes' in fields:
 420                 doc['themes'] = fields['themes']
 421             doc['uid'] = "part%s%s%s" % (doc['header_index'],
 422                                          doc['header_span'],
 423                                          doc.get('fragment_anchor', ''))
 424             return doc
 425
 426         def give_me_utf8(s):
 427             if isinstance(s, unicode):
 428                 return s.encode('utf-8')
 429             else:
 430                 return s
 431
 432         fragments = {}
 433         snippets = Snippets(book.id).open('w')
 434         try:
 435             for header, position in zip(list(master), range(len(master))):
 436
 437                 if header.tag in self.skip_header_tags:
 438                     continue
 439                 if header.tag is etree.Comment:
 440                     continue
 441
 442                 # section content
 443                 content = []
 444                 footnote = []
 445
 446                 def all_content(text):
 447                     for frag in fragments.values():
 448                         frag['text'].append(text)
 449                     content.append(text)
 450                 handle_text = [all_content]
 451
 452                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 453                     # handle footnotes
 454                     if start is not None and start.tag in self.footnote_tags:
 455                         footnote = []
 456
 457                         def collect_footnote(t):
 458                             footnote.append(t)
 459
 460                         handle_text.append(collect_footnote)
 461                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 462                         handle_text.pop()
 463                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 464                                        text=u''.join(footnote),
 465                                        is_footnote=True)
 466                         self.index.add(doc)
 467                         footnote = []
 468
 469                     # handle fragments and themes.
 470                     if start is not None and start.tag == 'begin':
 471                         fid = start.attrib['id'][1:]
 472                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 473
 474                     # themes for this fragment
 475                     elif start is not None and start.tag == 'motyw':
 476                         fid = start.attrib['id'][1:]
 477                         handle_text.append(None)
 478                         if start.text is not None:
 479                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 480                     elif end is not None and end.tag == 'motyw':
 481                         handle_text.pop()
 482
 483                     elif start is not None and start.tag == 'end':
 484                         fid = start.attrib['id'][1:]
 485                         if fid not in fragments:
 486                             continue  # a broken <end> node, skip it
 487                         frag = fragments[fid]
 488                         if frag['themes'] == []:
 489                             continue  # empty themes list.
 490                         del fragments[fid]
 491
 492                         doc = add_part(snippets,
 493                                        header_type=frag['start_header'],
 494                                        header_index=frag['start_section'],
 495                                        header_span=position - frag['start_section'] + 1,
 496                                        fragment_anchor=fid,
 497                                        text=fix_format(frag['text']),
 498                                        themes=frag['themes'])
 499                         self.index.add(doc)
 500
 501                         # Collect content.
 502
 503                     if text is not None and handle_text is not []:
 504                         hdl = handle_text[-1]
 505                         if hdl is not None:
 506                             hdl(text)
 507
 508                         # in the end, add a section text.
 509                 doc = add_part(snippets, header_index=position,
 510                                header_type=header.tag, text=fix_format(content))
 511
 512                 self.index.add(doc)
 513
 514         finally:
 515             snippets.close()
 516
 517
 518 class SearchResult(object):
 519     def __init__(self, doc, how_found=None, query=None, query_terms=None):
 520         #        self.search = search
 521         self.boost = 1.0
 522         self._hits = []
 523         self._processed_hits = None  # processed hits
 524         self.snippets = []
 525         self.query_terms = query_terms
 526
 527         if 'score' in doc:
 528             self._score = doc['score']
 529         else:
 530             self._score = 0
 531
 532         self.book_id = int(doc["book_id"])
 533
 534         try:
 535             self.published_date = int(doc.get("published_date"))
 536         except ValueError:
 537             self.published_date = 0
 538
 539         # content hits
 540         header_type = doc.get("header_type", None)
 541         # we have a content hit in some header of fragment
 542         if header_type is not None:
 543             sec = (header_type, int(doc["header_index"]))
 544             header_span = doc['header_span']
 545             header_span = header_span is not None and int(header_span) or 1
 546             fragment = doc.get("fragment_anchor", None)
 547             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 548             snippets_rev = doc.get('snippets_revision', None)
 549
 550             hit = (sec + (header_span,), fragment, self._score, {
 551                 'how_found': how_found,
 552                 'snippets_pos': snippets_pos,
 553                 'snippets_revision': snippets_rev,
 554                 'themes': doc.get('themes', []),
 555                 'themes_pl': doc.get('themes_pl', [])
 556                 })
 557
 558             self._hits.append(hit)
 559
 560     def __unicode__(self):
 561         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 562             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 563
 564     def __str__(self):
 565         return unicode(self).encode('utf-8')
 566
 567     @property
 568     def score(self):
 569         return self._score * self.boost
 570
 571     def merge(self, other):
 572         if self.book_id != other.book_id:
 573             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 574         self._hits += other._hits
 575         if other.score > self.score:
 576             self._score = other._score
 577         return self
 578
 579     def get_book(self):
 580         if hasattr(self, '_book'):
 581             return self._book
 582         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 583         return self._book
 584
 585     book = property(get_book)
 586
 587     POSITION = 0
 588     FRAGMENT = 1
 589     POSITION_INDEX = 1
 590     POSITION_SPAN = 2
 591     SCORE = 2
 592     OTHER = 3
 593
 594     @property
 595     def hits(self):
 596         if self._processed_hits is not None:
 597             return self._processed_hits
 598
 599         # to sections and fragments
 600         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 601
 602         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 603
 604         # sections not covered by fragments
 605         sect = filter(lambda s: 0 == len(filter(
 606             lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
 607             and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
 608             frags)), sect)
 609
 610         hits = []
 611
 612         def remove_duplicates(lst, keyfn, compare):
 613             els = {}
 614             for e in lst:
 615                 eif = keyfn(e)
 616                 if eif in els:
 617                     if compare(els[eif], e) >= 1:
 618                         continue
 619                 els[eif] = e
 620             return els.values()
 621
 622         # remove fragments with duplicated fid's and duplicated snippets
 623         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 624         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 625         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 626
 627         # remove duplicate sections
 628         sections = {}
 629
 630         for s in sect:
 631             si = s[self.POSITION][self.POSITION_INDEX]
 632             # skip existing
 633             if si in sections:
 634                 if sections[si]['score'] >= s[self.SCORE]:
 635                     continue
 636
 637             m = {'score': s[self.SCORE],
 638                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 639                  }
 640             m.update(s[self.OTHER])
 641             sections[si] = m
 642
 643         hits = sections.values()
 644
 645         for f in frags:
 646             try:
 647                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 648             except catalogue.models.Fragment.DoesNotExist:
 649                 # stale index
 650                 continue
 651             # Figure out if we were searching for a token matching some word in theme name.
 652             themes = frag.tags.filter(category='theme')
 653             themes_hit = set()
 654             if self.query_terms is not None:
 655                 for i in range(0, len(f[self.OTHER]['themes'])):
 656                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 657                     tms = map(unicode.lower, tms)
 658                     for qt in self.query_terms:
 659                         if qt in tms:
 660                             themes_hit.add(f[self.OTHER]['themes'][i])
 661                             break
 662
 663             def theme_by_name(n):
 664                 th = filter(lambda t: t.name == n, themes)
 665                 if th:
 666                     return th[0]
 667                 else:
 668                     return None
 669             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 670
 671             m = {'score': f[self.SCORE],
 672                  'fragment': frag,
 673                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 674                  'themes': themes,
 675                  'themes_hit': themes_hit
 676                  }
 677             m.update(f[self.OTHER])
 678             hits.append(m)
 679
 680         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 681
 682         self._processed_hits = hits
 683
 684         return hits
 685
 686     @staticmethod
 687     def aggregate(*result_lists):
 688         books = {}
 689         for rl in result_lists:
 690             for r in rl:
 691                 if r.book_id in books:
 692                     books[r.book_id].merge(r)
 693                 else:
 694                     books[r.book_id] = r
 695         return books.values()
 696
 697     def __cmp__(self, other):
 698         c = cmp(self.score, other.score)
 699         if c == 0:
 700             # this is inverted, because earlier date is better
 701             return cmp(other.published_date, self.published_date)
 702         else:
 703             return c
 704
 705     def __len__(self):
 706         return len(self.hits)
 707
 708     def snippet_pos(self, idx=0):
 709         return self.hits[idx]['snippets_pos']
 710
 711     def snippet_revision(self, idx=0):
 712         try:
 713             return self.hits[idx]['snippets_revision']
 714         except:
 715             return None
 716
 717
 718 class Search(SolrIndex):
 719     """
 720     Search facilities.
 721     """
 722     def __init__(self, default_field="text"):
 723         super(Search, self).__init__(mode='r')
 724
 725
 726     def make_term_query(self, query, field='text', modal=operator.or_):
 727         """
 728         Returns term queries joined by boolean query.
 729         modal - applies to boolean query
 730         fuzzy - should the query by fuzzy.
 731         """
 732         if query is None: query = ''
 733         q = self.index.Q()
 734         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
 735                         query.split(r" ")), q)
 736
 737         return q
 738
 739     def search_phrase(self, searched, field='text', book=False,
 740                       filters=None,
 741                       snippets=False):
 742         if filters is None: filters = []
 743         if book: filters.append(self.index.Q(is_book=True))
 744
 745         q = self.index.query(**{field: searched})
 746         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 747         res = q.execute()
 748         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 749
 750     def search_some(self, searched, fields, book=True,
 751                     filters=None, snippets=True, query_terms=None):
 752         assert isinstance(fields, list)
 753         if filters is None: filters = []
 754         if book: filters.append(self.index.Q(is_book=True))
 755
 756         query = self.index.Q()
 757
 758         for fld in fields:
 759             query = self.index.Q(query | self.make_term_query(searched, fld))
 760
 761         query = self.index.query(query)
 762         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 763         res = query.execute()
 764         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 765
 766
 767     def search_everywhere(self, searched, query_terms=None):
 768         """
 769         Tries to use search terms to match different fields of book (or its parts).
 770         E.g. one word can be an author survey, another be a part of the title, and the rest
 771         are some words from third chapter.
 772         """
 773         books = []
 774         # content only query : themes x content
 775         q = self.make_term_query(searched, 'text')
 776         q_themes = self.make_term_query(searched, 'themes_pl')
 777
 778         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 779         res = query.execute()
 780
 781         for found in res:
 782             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 783
 784         # query themes/content x author/title/tags
 785         in_content = self.index.Q()
 786         in_meta = self.index.Q()
 787
 788         for fld in ['themes_pl', 'text']:
 789             in_content |= self.make_term_query(searched, field=fld)
 790
 791         for fld in ['tags', 'authors', 'title']:
 792             in_meta |= self.make_term_query(searched, field=fld)
 793
 794         q = in_content & in_meta
 795         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 796
 797         for found in res:
 798             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 799
 800         return books
 801
 802     def get_snippets(self, searchresult, query, field='text', num=1):
 803         """
 804         Returns a snippet for found scoreDoc.
 805         """
 806         maxnum = len(searchresult)
 807         if num is None or num < 0 or num > maxnum:
 808             num = maxnum
 809         book_id = searchresult.book_id
 810         revision = searchresult.snippet_revision()
 811         snippets = Snippets(book_id, revision=revision)
 812         snips = [None] * maxnum
 813         try:
 814             snippets.open()
 815             idx = 0
 816             while idx < maxnum and num > 0:
 817                 position, length = searchresult.snippet_pos(idx)
 818                 if position is None or length is None:
 819                     continue
 820                 text = snippets.get((int(position),
 821                                      int(length)))
 822                 snip = self.index.highlight(text=text, field=field, q=query)
 823                 snips[idx] = snip
 824                 if snip:
 825                     num -= 1
 826                 idx += 1
 827
 828         except IOError, e:
 829             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 830             return []
 831         finally:
 832             snippets.close()
 833
 834             # remove verse end markers..
 835         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 836
 837         searchresult.snippets = snips
 838
 839         return snips
 840
 841     def hint_tags(self, query, pdcounter=True, prefix=True):
 842         """
 843         Return auto-complete hints for tags
 844         using prefix search.
 845         """
 846         q = self.index.Q()
 847         query = query.strip()
 848         for field in ['tag_name', 'tag_name_pl']:
 849             if prefix:
 850                 q |= self.index.Q(**{field: query + "*"})
 851             else:
 852                 q |= self.make_term_query(query, field=field)
 853         qu = self.index.query(q).exclude(tag_category="book")
 854
 855         return self.search_tags(qu, pdcounter=pdcounter)
 856
 857     def search_tags(self, query, filters=None, pdcounter=False):
 858         """
 859         Search for Tag objects using query.
 860         """
 861         if not filters: filters = []
 862         if not pdcounter:
 863             filters.append(~self.index.Q(is_pdcounter=True))
 864         res = self.apply_filters(query, filters).execute()
 865
 866         tags = []
 867         pd_tags = []
 868
 869         for doc in res:
 870             is_pdcounter = doc.get('is_pdcounter', False)
 871             category = doc.get('tag_category')
 872             try:
 873                 if is_pdcounter == True:
 874                     if category == 'pd_author':
 875                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
 876                     elif category == 'pd_book':
 877                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
 878                         tag.category = 'pd_book'  # make it look more lik a tag.
 879                     else:
 880                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
 881                     pd_tags.append(tag)
 882                 else:
 883                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
 884                     tags.append(tag)
 885
 886             except catalogue.models.Tag.DoesNotExist: pass
 887             except PDCounterAuthor.DoesNotExist: pass
 888             except PDCounterBook.DoesNotExist: pass
 889
 890         tags_slugs = set(map(lambda t: t.slug, tags))
 891         tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
 892
 893         log.debug('search_tags: %s' % tags)
 894
 895         return tags
 896
 897     def hint_books(self, query, prefix=True):
 898         """
 899         Returns auto-complete hints for book titles
 900         Because we do not index 'pseudo' title-tags.
 901         Prefix search.
 902         """
 903         q = self.index.Q()
 904         query = query.strip()
 905         if prefix:
 906             q |= self.index.Q(title=query + "*")
 907         else:
 908             q |= self.make_term_query(query, field='title')
 909         qu = self.index.query(q)
 910         only_books = self.index.Q(is_book=True)
 911         return self.search_books(qu, [only_books])
 912
 913     def search_books(self, query, filters=None, max_results=10):
 914         """
 915         Searches for Book objects using query
 916         """
 917         bks = []
 918         bks_found = set()
 919         query = query.query(is_book=True)
 920         res = self.apply_filters(query, filters).field_limit(['book_id'])
 921         for r in res:
 922             try:
 923                 bid = r['book_id']
 924                 if not bid in bks_found:
 925                     bks.append(catalogue.models.Book.objects.get(id=bid))
 926                     bks_found.add(bid)
 927             except catalogue.models.Book.DoesNotExist: pass
 928         return bks
 929
 930
 931     @staticmethod
 932     def apply_filters(query, filters):
 933         """
 934         Apply filters to a query
 935         """
 936         if filters is None: filters = []
 937         filters = filter(lambda x: x is not None, filters)
 938         for f in filters:
 939             query = query.query(f)
 940         return query