apps/search/index.py

# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
import sunburnt
import custom
import operator

log = logging.getLogger('search')

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # When writing, pick the first unused revision number.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Returns a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            # Remove all revisioned snippet files for this book as well.
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

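# A minimal usage sketch for Snippets (hypothetical book id; assumes
# settings.SEARCH_INDEX points at a writable directory):
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"First snippet text")  # -> (position, length)
#     finally:
#         snips.close()
#
#     snips = Snippets(123, revision=snips.revision).open()
#     try:
#         assert snips.get(pos) == u"First snippet text"
#     finally:
#         snips.close()
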
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

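    # A hypothetical sketch of calling delete_query(); it accepts any
    # number of sunburnt queries as separate arguments:
    #
    #     index = Index()
    #     index.delete_query(index.index.Q(book_id=1), index.index.Q(book_id=2))
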
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems) and category.
        """
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            # delete_query() takes each query as a separate argument,
            # so the list has to be unpacked here.
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # Then add them (all of them, or just the ones passed).
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                log.debug("Adding tag document: %s" % doc)
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - a Book instance or a book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # Let's not index source_name - it's only used for extracting the publish date.
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        log.debug("Adding book document: %s" % book_doc)
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

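    # A hypothetical full-reindex sketch (e.g. from a management command);
    # assumes the Solr index is reachable via settings.SOLR:
    #
    #     index = Index()
    #     for book in catalogue.models.Book.objects.all():
    #         index.index_book(book)
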
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # Since no type information is available, we use the validator
                # to tell how the value should be converted.
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Get the published date: the trailing number in source_name.
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

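    # published_date_re picks the trailing number out of source_name;
    # an illustrative example:
    #
    #     >>> Index.published_date_re.search(u"Czytelnik, Warszawa 1990").group(1)
    #     u'1990'
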
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        if book_fields is None:
            book_fields = {}

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # Pass ignore_tags down, so ignored tags are skipped
                    # at any depth, not only directly under the header.
                    for b, t, e in walker(child, ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            if isinstance(text, list):
                # Need to join it first.
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            # Strip verse-end markers.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Store the text in the snippet file and remember where it went.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

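        # walker() flattens the tree into (start, text, end) event triples;
        # for an illustrative element like <akap>Ala <begin id="m1"/>ma</akap>
        # it yields, in order: (akap, None, None), (None, u"Ala ", None),
        # (begin, None, None), (None, None, begin), (None, u"ma", None),
        # (None, None, akap).
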
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # Section content.
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # Handle footnotes.
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        log.debug("Adding footnote document: %s" % doc)
                        self.index.add(doc)
                        footnote = []

                    # Handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [],
                            'start_section': position, 'start_header': header.tag}

                    # Themes for this fragment.
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        log.debug("Adding fragment document: %s" % doc)
                        self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # In the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                log.debug("Adding section document: %s" % doc)
                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # Missing or malformed date.
            self.published_date = 0

        # Content hits.
        header_type = doc.get("header_type", None)
        # We have a content hit in some header or fragment.
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision')

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "This search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into a raw hit tuple and into its position triple.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

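    # Illustrative shape of a raw hit tuple, indexed by the constants above:
    #
    #     ((header_type, header_index, header_span), fragment_anchor, score,
    #      {'how_found': ..., 'snippets_pos': ..., 'snippets_revision': ...,
    #       'themes': [...], 'themes_pl': [...]})
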
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # Split raw hits into fragment hits and section hits.
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # Keep only sections not covered by fragments.
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # Remove fragments with duplicated fids, keeping the best-scoring one.
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT],
                                  lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # Remove duplicate sections, keeping the best-scoring one.
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # Skip an existing section unless this one scores better.
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # Stale index.
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

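    # A hypothetical sketch of combining several result lists; merge()
    # joins hits for the same book, and __cmp__ below drives the sort:
    #
    #     results = SearchResult.aggregate(
    #         search.search_phrase(u"lokomotywa"),
    #         search.search_some(u"lokomotywa", ['title', 'authors']))
    #     results.sort(reverse=True)
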
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # This is inverted, because an earlier date is better.
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns single-term queries for the words of `query`,
        joined with the boolean operator given as `modal`.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split()), q)

        return q

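    # Illustratively, make_term_query(u"pan tadeusz") builds roughly
    # Q(text=u"pan") | Q(text=u"tadeusz"); with modal=operator.and_,
    # both words would be required instead.
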
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and
        the rest some words from the third chapter.
        """
        books = []
        # Content-only query: themes x content.
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # Query themes/content x author/title/tags.
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

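    # A hypothetical call mixing metadata and content terms; query_terms
    # is the lowercase term set used later for theme matching:
    #
    #     search = Search()
    #     results = search.search_everywhere(u"Mickiewicz ojczyzna",
    #                                        query_terms=set([u'mickiewicz', u'ojczyzna']))
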
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # Skip hits without stored snippets, but still advance,
                    # so the loop always terminates.
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # Remove verse end markers.
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips

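    # A hypothetical highlighting flow for phrase results:
    #
    #     for result in search.search_phrase(u"lokomotywa"):
    #         snips = search.get_snippets(result, u"lokomotywa", num=3)
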
    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

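    # Hypothetical auto-complete usage:
    #
    #     search.hint_tags(u"Mick")      # prefix search, e.g. for Mickiewicz
    #     search.hint_books(u"Pan Tad")  # see hint_books() further down
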
    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles, using prefix
        search. (Needed because we do not index 'pseudo' title tags.)
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        # Limit the result list to max_results rows.
        res = self.apply_filters(query, filters).field_limit(['book_id']).paginate(rows=max_results)
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)