e7f28c9da1224b8b00c4f2a5eb4a2b1534c798d7
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
import sunburnt
import highlight

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = highlight.HLSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
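    # Illustrative usage (a sketch; assumes settings.SEARCH_INDEX exists and
    # is writable):
    #
    #   snips = Snippets(book_id=1).open('w')
    #   try:
    #       pos = snips.add(u"Litwo! Ojczyzno moja!")   # -> (0, 21)
    #   finally:
    #       snips.close()
    #   snips.open()
    #   try:
    #       assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
    #   finally:
    #       snips.close()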
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Returns a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored
        there as a unicode string.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass


class Index(SolrIndex):
    """
    Class for indexing books.
    """
    def __init__(self):
        super(Index, self).__init__()

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so reimplement it
        by collecting and deleting a list of uids instead.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False
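
    # Illustrative use (a sketch): drop every document indexed for two books,
    # whatever their type, by collecting their uids first:
    #
    #   index.delete_query(index.index.Q(book_id=1), index.index.Q(book_id=2))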

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them (all, or just the ones passed in)
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        }
                doc['uid'] = "tag%d" % tag.id
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """
        Removes a book from the search index.
        book_or_id - a Book instance or a book id.
        """
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the book's contents.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using an index that hasn't been reopened yet
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc

        self.index_content(book, book_fields={
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']})

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
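    # For example (illustrative): searching u"Czytelnik, Warszawa 1990" or
    # u"Czytelnik, Warszawa 1990]. " captures '1990'; trailing ']', '.' and
    # spaces after the year are ignored.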

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date from source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return
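
        # Illustrative traversal (a sketch): for an element tree like
        # <a>x<b>y</b>z</a>, walker(a) yields, in order:
        #   (a, None, None), (None, 'x', None), (b, None, None),
        #   (None, 'y', None), (None, None, b), (None, 'z', None),
        #   (None, None, a)
        # i.e. exactly one of (start, text, end) is set in each triple.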

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        if footnote:
                            doc = add_part(snippets, header_index=position,
                                           header_type=header.tag,
                                           text=u''.join(footnote),
                                           is_footnote=True)
                            self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [],
                                          'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, search, doc, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None:
            tokens_cache = {}

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # cache for the hits property

        self.book_id = int(doc["book_id"])

        pd = doc["published_date"]
        try:
            self.published_date = int(pd)
        except ValueError:
            self.published_date = 0

        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1

            fragment = doc.get("fragment_anchor", None)

            if snippets:
                snippets = snippets.replace("/\n", "\n")
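
            # hit layout (a sketch of the tuple built below):
            #   ((header_type, header_index, header_span), fragment_anchor,
            #    score, {'how_found': ..., 'snippets': [...]})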
            hit = (sec + (header_span,), fragment, self._score,
                   {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # split hits into fragments and sections
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        hits = []

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()
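
        # e.g. (illustrative): remove_duplicates([(1, 'a'), (2, 'a')], lambda e: e[1],
        #                                        lambda a, b: cmp(a[0], b[0]))
        # keeps only (2, 'a') -- for each key, the element winning the comparison.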

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
        frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
                                  lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip an existing section unless this hit scores better
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # inverted, because an earlier publication date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c


class Hint(object):
    """
    Given some hint information (things we already know about the search
    target - like author, title (a specific book), epoch, genre or kind),
    we can narrow the search down using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we are searching these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are required.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the
        'tags' field), returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched
        when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
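
    # e.g. (illustrative): with an author tag hinted and no book hint,
    # just_search_in(['authors', 'title', 'text']) -> ['title', 'text']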


class Search(SolrIndex):
    """
    Search facilities.
    """
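    # NOTE: this class still drives the old Lucene API (IndexSearcher,
    # QueryParser, filters...), none of which is imported above, while the
    # indexing classes already talk to Solr through sunburnt. It looks like
    # mid-migration code and will not run against the Solr backend as-is.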
    def __init__(self, default_field="text"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        reader = IndexReader.open(self.store, True)
        self.searcher = IndexSearcher(reader)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
        index_changed.connect(self.reopen)

    def close(self):
        reader = self.searcher.getIndexReader()
        self.searcher.close()
        reader.close()
        super(Search, self).close()
        index_changed.disconnect(self.reopen)

    def reopen(self, **unused):
        reader = self.searcher.getIndexReader()
        rdr = reader.reopen()
        if not rdr.equals(reader):
            log.debug('Reopening index')
            oldsearch = self.searcher
            self.searcher = IndexSearcher(rdr)
            oldsearch.close()
            reader.close()

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax. (for humans)
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='text', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the
        last case they are just returned (so we can reuse tokens, if we don't
        change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    @staticmethod
    def fuzziness(fuzzy):
        """Helper method to sanitize a fuzziness value."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
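
    # e.g. (illustrative): fuzziness(0.8) -> 0.8, fuzziness(True) -> 0.5,
    # fuzziness(False) -> None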

    def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
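
    # e.g. (a sketch): make_phrase([u'pan', u'tadeusz'], field='title') builds a
    # slop-2 PhraseQuery; with fuzzy=0.7, each token is instead expanded to its
    # fuzzy term variants in a MultiPhraseQuery.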

    def make_term_query(self, tokens, field='text', modal=None, fuzzy=False):
        """
        Returns term queries joined into a boolean query.
        modal - the occur mode for the boolean clauses
                (defaults to BooleanClause.Occur.SHOULD)
        fuzzy - should the query be fuzzy.
        """
        if modal is None:
            modal = BooleanClause.Occur.SHOULD
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None),
                             searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None:
            filters = []
        if tokens_cache is None:
            tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
            query.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy),
                                    BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author
        or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy)
                for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy),
                                BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a
        slop of 2, the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q),
                                          how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use the search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and
        the rest some words from a chapter.
        """
        if tokens_cache is None:
            tokens_cache = {}
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='text', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='text',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent',
                                      searched=searched))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'text']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False),
                                         BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False),
                                      BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))

        return books

    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #                   -> title
    #                   -> authors
    #     - (keywords)  -> authors
    #                   -> motyw
    #                   -> tags
    #                   -> content
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    #
    #     return None

    def get_snippets(self, scoreDoc, query, field='text'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        revision = stored.get('snippets_revision')
        if revision:
            revision = int(revision)

        # locate content.
        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id, revision=revision)

        try:
            snippets.open()
        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []

        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(),
                                                         scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a Lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filt, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            try:
                if is_pdcounter == 'true':
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def search_books(self, query, filt=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filt, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            try:
                bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
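
    # e.g. (illustrative): make_prefix_phrase([u'pan', u'tad'], 'title') matches
    # titles containing the phrase u'pan tad...' -- only the last token is
    # prefix-expanded; earlier tokens must match exactly.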

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for tags,
        using prefix search by default.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles, because we do not index
        'pseudo' title-tags. Prefix search by default.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=None):
        """
        Chains a list of filters together (with ChainedFilter.AND by default).
        """
        if op is None:
            op = ChainedFilter.AND
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
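
    # e.g. (a sketch): chain_filters([f1, None, f2]) AND-chains f1 and f2,
    # silently dropping Nones; chain_filters([]) returns None (no filtering).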

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()

    def hint(self):
        return Hint(self)