remove hasPart/isPartOf from oai_dc
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4
5 import os
6 import re
7 import errno
8 from librarian import dcparser
9 from librarian.parser import WLDocument
10 from lxml import etree
11 import catalogue.models
12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
13 from itertools import chain
14 import traceback
15 import logging
16 log = logging.getLogger('search')
17 import sunburnt
18 import custom
19 import operator
20
21
class SolrIndex(object):
    """Thin base class that opens a CustomSolrInterface to the configured Solr."""

    def __init__(self, mode=None):
        # mode is forwarded verbatim to the interface (callers in this file
        # pass 'rw' for indexing and 'r' for searching); None lets the
        # interface use its default.
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
25
26
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Ensure the snippet directory exists; tolerate it already being
        # there (concurrent creation race), re-raise anything else.
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None
        # byte offset of the next write; (re)set by open()
        self.position = 0

    @property
    def path(self):
        """Path of the snippet file for the current (book_id, revision)."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.

        Snippets are stored as raw UTF-8 bytes, so the file is always
        opened in binary mode. When opening for writing, pick the first
        unused revision number rather than overwriting an existing file.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close snippet file"""
        self.file.close()

    def remove(self):
        """Best-effort removal of the base snippet file and all revisions."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            # keep deleting numbered revisions until one is missing
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass
107
108
class Index(SolrIndex):
    """
    Class indexing books: one metadata document per book, plus one
    document per content part (section or themed fragment).
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Returns True if any documents were deleted, False otherwise.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            # page through all matches of this query, collecting their uids
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category

        Keyword args:
            remove_only -- only delete the given tags, do not re-add them.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            # BUG FIX: delete_query() takes each query as a separate
            # positional argument; passing the list itself made it call
            # .field_limiter on a plain list (AttributeError).
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book_or_id - Book instance or a book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        # a subset of the metadata is replicated onto every content part
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    # top-level tags that mark the master (body) element of a book
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # elements whose content is not indexed at all
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # footnote elements; their text becomes separate part documents
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # header-level elements skipped during content indexing
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # matches a trailing number in source_name, used as the publish year
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname

        dc_only - if given, only those DC field names are extracted.
        """
        fields = {}

        if book_info is None:
            # BUG FIX: the file handle was opened and never closed; use a
            # context manager (dcparser.parse consumes it synchronously).
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date (trailing number in source_name)
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree (None if absent).
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.

        book_fields - extra fields copied onto every part document.
        """
        if book_fields is None:
            book_fields = {}

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            """Depth-first traversal yielding (start, text, end) triples."""
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            if isinstance(text, list):
                # need to join it first
                # BUG FIX: join the `text` argument; the original joined the
                # closed-over `content` list, which was wrong when called
                # with a fragment's text (fix_format(frag['text'])).
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            # drop trailing verse-continuation slashes
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            """Build a part document; store its text in the snippet file."""
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # remember where this part's text lives in the snippet file
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                # stack of text handlers; the innermost active one wins
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        # NOTE: the original also tested `footnote is not []`,
                        # which is identity, not equality, and thus always
                        # true - dropped as a no-op.
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # suspend text collection while inside the theme label
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content through the innermost active handler.
                    # (The original guarded on `handle_text is not []`, an
                    # always-true identity test; a truthiness check also
                    # protects against an empty handler stack.)
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
516
517
class SearchResult(object):
    """
    A single search result, tied to one book.

    Raw per-part hits (sections/fragments) accumulate in self._hits;
    the `hits` property post-processes them into display dicts.
    """
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        # BUG FIX: a document without published_date yields None here, and
        # int(None) raises TypeError, not ValueError - catch both.
        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            # BUG FIX: snippets_revision is only stored for revisioned
            # snippet files - use .get() instead of risking a KeyError.
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        # boost scales the raw index score
        return self._score * self.boost

    def merge(self, other):
        """Merge hits of another result for the same book; keep best score."""
        if self.book_id != other.book_id:
            # BUG FIX: message read "is or book" - typo for "is for book".
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        # cache the Book instance on first access
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into the raw hit tuples stored in self._hits
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            """Keep only the best (per `compare`) element for each key."""
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # BUG FIX: str.split(r' +') split on the literal two
                    # characters " +"; a regex whitespace split was clearly
                    # intended here.
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Collapse several result lists into one result per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # programming errors; catch only the expected lookup failures.
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
716
717
718 class Search(SolrIndex):
719     """
720     Search facilities.
721     """
    def __init__(self, default_field="text"):
        """
        Open the Solr index in read-only mode.

        NOTE(review): `default_field` is accepted but not stored or used
        in this constructor - confirm whether any caller relies on it.
        """
        super(Search, self).__init__(mode='r')
724
725     # def get_tokens(self, searched, field='text', cached=None):
726     #     """returns tokens analyzed by a proper (for a field) analyzer
727     #     argument can be: StringReader, string/unicode, or tokens. In the last case
728     #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
729     #     """
730     #     if cached is not None and field in cached:
731     #         return cached[field]
732
733     #     if isinstance(searched, str) or isinstance(searched, unicode):
734     #         searched = StringReader(searched)
735     #     elif isinstance(searched, list):
736     #         return searched
737
738     #     searched.reset()
739     #     tokens = self.analyzer.reusableTokenStream(field, searched)
740     #     toks = []
741     #     while tokens.incrementToken():
742     #         cta = tokens.getAttribute(CharTermAttribute.class_)
743     #         toks.append(cta.toString())
744
745     #     if cached is not None:
746     #         cached[field] = toks
747
748     #     return toks
749
750     # @staticmethod
751     # def fuzziness(fuzzy):
752     #     """Helper method to sanitize fuzziness"""
753     #     if not fuzzy:
754     #         return None
755     #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
756     #         return fuzzy
757     #     else:
758     #         return 0.5
759
760     # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
761     #     """
762     #     Return a PhraseQuery with a series of tokens.
763     #     """
764     #     if fuzzy:
765     #         phrase = MultiPhraseQuery()
766     #         for t in tokens:
767     #             term = Term(field, t)
768     #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
769     #             fuzzterms = []
770
771     #             while True:
772     #                 ft = fuzzterm.term()
773     #                 if ft:
774     #                     fuzzterms.append(ft)
775     #                 if not fuzzterm.next(): break
776     #             if fuzzterms:
777     #                 phrase.add(JArray('object')(fuzzterms, Term))
778     #             else:
779     #                 phrase.add(term)
780     #     else:
781     #         phrase = PhraseQuery()
782     #         phrase.setSlop(slop)
783     #         for t in tokens:
784     #             term = Term(field, t)
785     #             phrase.add(term)
786     #     return phrase
787
788     def make_term_query(self, query, field='text', modal=operator.or_):
789         """
790         Returns term queries joined by boolean query.
791         modal - applies to boolean query
792         fuzzy - should the query by fuzzy.
793         """
794         q = self.index.Q()
795         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
796                         query.split(r" ")), q)
797
798         return q
799
800     def search_phrase(self, searched, field='text', book=False,
801                       filters=None,
802                       snippets=False):
803         if filters is None: filters = []
804         if book: filters.append(self.index.Q(is_book=True))
805
806         q = self.index.query(**{field: searched})
807         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
808         res = q.execute()
809         return [SearchResult(found, how_found=u'search_phrase') for found in res]
810
811     def search_some(self, searched, fields, book=True,
812                     filters=None, snippets=True, query_terms=None):
813         assert isinstance(fields, list)
814         if filters is None: filters = []
815         if book: filters.append(self.index.Q(is_book=True))
816
817         query = self.index.Q()
818
819         for fld in fields:
820             query = self.index.Q(query | self.make_term_query(searched, fld))
821
822         query = self.index.query(query)
823         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
824         res = query.execute()
825         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
826
827     # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
828     #     """
829     #     Search for perfect book matches. Just see if the query matches with some author or title,
830     #     taking hints into account.
831     #     """
832     #     fields_to_search = ['authors', 'title']
833     #     only_in = None
834     #     if hint:
835     #         if not hint.should_search_for_book():
836     #             return []
837     #         fields_to_search = hint.just_search_in(fields_to_search)
838     #         only_in = hint.book_filter()
839
840     #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
841
842     #     books = []
843     #     for q in qrys:
844     #         top = self.searcher.search(q,
845     #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
846     #             max_results)
847     #         for found in top.scoreDocs:
848     #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
849     #     return books
850
851     # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
852     #     fields_to_search = ['tags', 'authors', 'title']
853
854     #     only_in = None
855     #     if hint:
856     #         if not hint.should_search_for_book():
857     #             return []
858     #         fields_to_search = hint.just_search_in(fields_to_search)
859     #         only_in = hint.book_filter()
860
861     #     tokens = self.get_tokens(searched, field='SIMPLE')
862
863     #     q = BooleanQuery()
864
865     #     for fld in fields_to_search:
866     #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
867     #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
868
869     #     books = []
870     #     top = self.searcher.search(q,
871     #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
872     #         max_results)
873     #     for found in top.scoreDocs:
874     #         books.append(SearchResult(self, found, how_found="search_book"))
875
876     #     return books
877
878     # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
879     #     """
880     #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
881     #     some part/fragment of the book.
882     #     """
883     #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
884
885     #     flt = None
886     #     if hint:
887     #         flt = hint.part_filter()
888
889     #     books = []
890     #     for q in qrys:
891     #         top = self.searcher.search(q,
892     #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
893     #                                                        flt]),
894     #                                    max_results)
895     #         for found in top.scoreDocs:
896     #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
897
898     #     return books
899
900     def search_everywhere(self, searched, query_terms=None):
901         """
902         Tries to use search terms to match different fields of book (or its parts).
903         E.g. one word can be an author survey, another be a part of the title, and the rest
904         are some words from third chapter.
905         """
906         books = []
907         # content only query : themes x content
908         q = self.make_term_query(searched, 'text')
909         q_themes = self.make_term_query(searched, 'themes_pl')
910
911         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
912         res = query.execute()
913
914         for found in res:
915             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
916
917         # query themes/content x author/title/tags
918         in_content = self.index.Q()
919         in_meta = self.index.Q()
920
921         for fld in ['themes_pl', 'text']:
922             in_content |= self.make_term_query(searched, field=fld)
923
924         for fld in ['tags', 'authors', 'title']:
925             in_meta |= self.make_term_query(searched, field=fld)
926
927         q = in_content & in_meta
928         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
929
930         for found in res:
931             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
932
933         return books
934
935     def get_snippets(self, searchresult, query, field='text', num=1):
936         """
937         Returns a snippet for found scoreDoc.
938         """
939         maxnum = len(searchresult)
940         if num is None or num < 0 or num > maxnum:
941             num = maxnum
942         book_id = searchresult.book_id
943         revision = searchresult.snippet_revision()
944         snippets = Snippets(book_id, revision=revision)
945         snips = [None] * maxnum
946         try:
947             snippets.open()
948             idx = 0
949             while idx < maxnum and num > 0:
950                 position, length = searchresult.snippet_pos(idx)
951                 if position is None or length is None:
952                     continue
953                 text = snippets.get((int(position),
954                                      int(length)))
955                 snip = self.index.highlight(text=text, field=field, q=query)
956                 snips[idx] = snip
957                 if snip:
958                     num -= 1
959                 idx += 1
960
961         except IOError, e:
962             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
963             return []
964         finally:
965             snippets.close()
966
967             # remove verse end markers..
968         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
969
970         searchresult.snippets = snips
971
972         return snips
973
974     def hint_tags(self, query, pdcounter=True, prefix=True):
975         """
976         Return auto-complete hints for tags
977         using prefix search.
978         """
979         q = self.index.Q()
980         query = query.strip()
981         for field in ['tag_name', 'tag_name_pl']:
982             if prefix:
983                 q |= self.index.Q(**{field: query + "*"})
984             else:
985                 q |= self.make_term_query(query, field=field)
986         qu = self.index.query(q).exclude(tag_category="book")
987
988         return self.search_tags(qu, pdcounter=pdcounter)
989
990     def search_tags(self, query, filters=None, pdcounter=False):
991         """
992         Search for Tag objects using query.
993         """
994         if not filters: filters = []
995         if not pdcounter:
996             filters.append(~self.index.Q(is_pdcounter=True))
997         res = self.apply_filters(query, filters).execute()
998
999         tags = []
1000         for doc in res:
1001             is_pdcounter = doc.get('is_pdcounter', False)
1002             category = doc.get('tag_category')
1003             try:
1004                 if is_pdcounter == True:
1005                     if category == 'pd_author':
1006                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1007                     elif category == 'pd_book':
1008                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1009                         tag.category = 'pd_book'  # make it look more lik a tag.
1010                     else:
1011                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1012                 else:
1013                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1014                     # don't add the pdcounter tag if same tag already exists
1015
1016                 tags.append(tag)
1017
1018             except catalogue.models.Tag.DoesNotExist: pass
1019             except PDCounterAuthor.DoesNotExist: pass
1020             except PDCounterBook.DoesNotExist: pass
1021
1022         log.debug('search_tags: %s' % tags)
1023
1024         return tags
1025
1026     def hint_books(self, query, prefix=True):
1027         """
1028         Returns auto-complete hints for book titles
1029         Because we do not index 'pseudo' title-tags.
1030         Prefix search.
1031         """
1032         q = self.index.Q()
1033         query = query.strip()
1034         if prefix:
1035             q |= self.index.Q(title=query + "*")
1036         else:
1037             q |= self.make_term_query(query, field='title')
1038         qu = self.index.query(q)
1039         only_books = self.index.Q(is_book=True)
1040         return self.search_books(qu, [only_books])
1041
1042     def search_books(self, query, filters=None, max_results=10):
1043         """
1044         Searches for Book objects using query
1045         """
1046         bks = []
1047         res = self.apply_filters(query, filters).field_limit(['book_id'])
1048         for r in res:
1049             try:
1050                 bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
1051             except catalogue.models.Book.DoesNotExist: pass
1052         return bks
1053  
1054     # def make_prefix_phrase(self, toks, field):
1055     #     q = MultiPhraseQuery()
1056     #     for i in range(len(toks)):
1057     #         t = Term(field, toks[i])
1058     #         if i == len(toks) - 1:
1059     #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1060     #             if pterms:
1061     #                 q.add(pterms)
1062     #             else:
1063     #                 q.add(t)
1064     #         else:
1065     #             q.add(t)
1066     #     return q
1067
1068     # @staticmethod
1069     # def term_filter(term, inverse=False):
1070     #     only_term = TermsFilter()
1071     #     only_term.addTerm(term)
1072
1073     #     if inverse:
1074     #         neg = BooleanFilter()
1075     #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1076     #         only_term = neg
1077
1078     #     return only_term
1079
1080
1081
1082     @staticmethod
1083     def apply_filters(query, filters):
1084         """
1085         Apply filters to a query
1086         """
1087         if filters is None: filters = []
1088         filters = filter(lambda x: x is not None, filters)
1089         for f in filters:
1090             query = query.query(f)
1091         return query
1092
1093     # def filtered_categories(self, tags):
1094     #     """
1095     #     Return a list of tag categories, present in tags list.
1096     #     """
1097     #     cats = {}
1098     #     for t in tags:
1099     #         cats[t.category] = True
1100     #     return cats.keys()
1101
1102     # def hint(self):
1103     #     return Hint(self)