OPDS with new SOLR interface.
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

import os
import re
import errno
import logging
import operator
import traceback
from itertools import chain

from django.conf import settings
from lxml import etree

from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
import sunburnt
import custom

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for an indexed object (a book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # when writing, pick the first unused revision number, so readers
            # still holding the old index open keep their old snippet file.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

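# A minimal round-trip sketch (assuming settings.SEARCH_INDEX points at a
# writable directory; the book id is made up). Snippets are written once
# during indexing and later read back by (position, length):
#
#     snips = Snippets(book_id=123).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")
#     finally:
#         snips.close()
#     reader = Snippets(123, revision=snips.revision).open()
#     try:
#         assert reader.get(pos) == u"Litwo! Ojczyzno moja!"
#     finally:
#         reader.close()

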
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # print "Will delete %s" % ','.join([x for x in uids])
            self.index.delete(uids)
            return True
        else:
            return False

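    # Hedged sketch: delete_query() accepts sunburnt Q objects (or ready
    # queries), pages through the matching uids, then issues a single delete:
    #
    #     idx = Index()
    #     idx.delete_query(idx.index.Q(book_id=123))   # 123 is a made-up id
    #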
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

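    # Usage sketch (the tag lookup itself is hypothetical):
    #
    #     tag = catalogue.models.Tag.objects.get(slug='romantyzm')
    #     Index().index_tags(tag)                    # reindex a single tag
    #     Index().index_tags()                       # rebuild the whole tag index
    #     Index().index_tags(tag, remove_only=True)  # only remove it
    #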
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - a Book instance or a book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

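    # End-to-end sketch (the book lookup is hypothetical):
    #
    #     book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
    #     index = Index()
    #     index.index_book(book)   # metadata doc + per-section content docs
    #     index.index.commit()     # assuming an explicit commit is wanted here
    #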
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date from the trailing year in source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

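    # Example: published_date_re grabs the trailing year from a source_name
    # string (the sample value below is made up):
    #
    #     m = Index.published_date_re.search(u"Czytelnik, Warszawa [1954].")
    #     m.groups()[0]   # -> '1954'
    #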
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            # depth-first walk yielding (start-node, text, end-node) triples
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        # print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # print '@ FRAG %s' % frag['text']
                        self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                # print '@ CONTENT: %s' % fix_format(content)
                self.index.add(doc)

        finally:
            snippets.close()

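# For reference, index_content() emits one Solr document per section header,
# plus one per footnote and one per themed fragment. A section document looks
# roughly like this (a sketch; the field values are made up):
#
#     {'book_id': 123, 'uid': 'part01',
#      'header_index': 0, 'header_span': 1, 'header_type': 'akap',
#      'text': u'...', 'snippets_position': 0, 'snippets_length': 2345}

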
class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # missing or non-numeric published_date
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into the hit tuples kept in self._hits
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split into sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip if the existing hit scored at least as high
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

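    # Hedged usage sketch: aggregate() merges per-book results coming from
    # several queries, so each book appears once with all of its hits:
    #
    #     s = Search()
    #     results = SearchResult.aggregate(
    #         s.search_phrase(u'lipa czarnoleska'),
    #         s.search_some(u'lipa czarnoleska', ['title', 'authors']))
    #     results.sort(reverse=True)   # uses __cmp__: score, then earlier date
    #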
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None

class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - the operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split()), q)

        return q

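    # E.g. make_term_query(u'ala ma kota') builds
    # Q(text='ala') | Q(text='ma') | Q(text='kota');
    # pass modal=operator.and_ to require all of the terms instead of any.
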
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use the search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's name, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a SearchResult.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError, e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

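    # Typical retrieval (a sketch; highlight() is provided by
    # custom.CustomSolrInterface):
    #
    #     s = Search()
    #     results = s.search_phrase(u'zielona lipa')
    #     if results:
    #         q = s.index.query(text=u'zielona lipa')
    #         print s.get_snippets(results[0], q, num=3)
    #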
    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Return auto-complete hints for book titles
        (we do not index 'pseudo' title-tags).
        Uses prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

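    # Filters are plain sunburnt Q objects chained onto the query; None entries
    # are skipped, so callers can pass optional filters unconditionally, e.g.:
    #
    #     s = Search()
    #     q = s.index.query(title=u'Ballady i romanse')
    #     q = Search.apply_filters(q, [s.index.Q(is_book=True), None])
    #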
    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)