# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
log = logging.getLogger('search')
import sunburnt
import custom
import operator


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        """Remove the snippet file and any numbered revisions of it."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

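# A minimal Snippets round-trip, as a sketch (assumes settings.SEARCH_INDEX
# is writable; book id 123 is hypothetical):
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")   # -> (0, 21)
#     finally:
#         snips.close()
#     snips = Snippets(123, revision=snips.revision).open()
#     try:
#         assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
#     finally:
#         snips.close()

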
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

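    # delete_query pages through the matching uids 100 at a time and then
    # issues a single delete, e.g. (sketch):
    #     self.delete_query(self.index.Q(book_id=1), self.index.Q(tag_id=5))
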
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes tags from the index (the given ones, or all of them), then,
        unless remove_only is passed, indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            # delete_query takes queries as positional arguments,
            # so unpack the list instead of passing it whole.
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                log.debug("Adding tag: %s", doc)
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed
            # by threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

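    # published_date_re grabs the last run of digits, ignoring trailing
    # brackets, dots and spaces; e.g. for a hypothetical source name
    # u"Czytelnik, Warszawa [1936]." it yields "1936".
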
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.group(1))
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
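
        # walker yields a stream of (start, text, end) events; e.g. for a
        # hypothetical <strofa>Ala <slowo>ma</slowo> kota</strofa> it yields:
        #     (strofa, None, None), (None, u"Ala ", None), (slowo, None, None),
        #     (None, u"ma", None), (None, None, slowo), (None, u" kota", None),
        #     (None, None, strofa)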

        def fix_format(text):
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # missing or non-numeric published_date
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc['snippets_revision']

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
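    # Indices into a raw hit tuple, which has the shape:
    #     ((header_type, header_index, header_span), fragment_anchor, score, other)
    # so e.g. hit[POSITION][POSITION_INDEX] is the section index of the hit
    # and hit[OTHER] carries the snippets/themes metadata dict.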

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing section unless this hit scores higher
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split()
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator to join the term queries with.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                        query.split()), q)
        return q
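
    # e.g. make_term_query(u"ala ma kota") builds (roughly)
    #     Q() | Q(text=u"ala") | Q(text=u"ma") | Q(text=u"kota"),
    # while modal=operator.and_ ANDs the terms together instead.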

    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
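
    # A typical call, as a sketch (field names as indexed above):
    #     results = search.search_some(u"mickiewicz ballady",
    #                                  ['authors', 'title', 'text'],
    #                                  query_terms=[u'mickiewicz', u'ballady'])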

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title,
        and the rest words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips
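
    # Sketch of use, with a result from one of the search methods above:
    #     res = search.search_phrase(u"ojczyzno moja")
    #     if res:
    #         snippets = search.get_snippets(res[0], u"ojczyzno moja", num=3)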

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s", int(doc.get('tag_id')), category)
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Return auto-complete hints for book titles
        (we do not index 'pseudo' title tags).
        Prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
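
    # e.g. apply_filters(self.index.query(title=u"Pan Tadeusz"),
    #                    [self.index.Q(is_book=True), None])
    # silently drops the None and ANDs the remaining filter onto the query.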

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)