apps/search/index.py

   1 # -*- coding: utf-8 -*-
   2
   3 from django.conf import settings
   4
   5 import os
   6 import re
   7 import errno
   8 from librarian import dcparser
   9 from librarian.parser import WLDocument
  10 from lxml import etree
  11 import catalogue.models
  12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  13 from itertools import chain
  14 import traceback
  15 import logging
  16 log = logging.getLogger('search')
  17 import sunburnt
  18 import custom
  19 import operator
  20
  21
  22 class SolrIndex(object):
  23     def __init__(self, mode=None):
  24         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  25
  26
  27 class Snippets(object):
  28     """
  29     This class manages snippet files for indexed object (book)
  30     the snippets are concatenated together, and their positions and
  31     lengths are kept in lucene index fields.
  32     """
  33     SNIPPET_DIR = "snippets"
  34
  35     def __init__(self, book_id, revision=None):
  36         try:
  37             os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  38         except OSError as exc:
  39             if exc.errno == errno.EEXIST:
  40                 pass
  41             else: raise
  42         self.book_id = book_id
  43         self.revision = revision
  44         self.file = None
  45
  46     @property
  47     def path(self):
  48         if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
  49         else: fn = "%d" % self.book_id
  50
  51         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  52
  53     def open(self, mode='r'):
  54         """
  55         Open the snippet file. Call .close() afterwards.
  56         """
  57         if not 'b' in mode:
  58             mode += 'b'
  59
  60         if 'w' in mode:
  61             if os.path.exists(self.path):
  62                 self.revision = 1
  63                 while True:
  64                     if not os.path.exists(self.path):
  65                         break
  66                     self.revision += 1
  67
  68         self.file = open(self.path, mode)
  69         self.position = 0
  70         return self
  71
  72     def add(self, snippet):
  73         """
  74         Append a snippet (unicode) to the snippet file.
  75         Return a (position, length) tuple
  76         """
  77         txt = snippet.encode('utf-8')
  78         l = len(txt)
  79         self.file.write(txt)
  80         pos = (self.position, l)
  81         self.position += l
  82         return pos
  83
  84     def get(self, pos):
  85         """
  86         Given a tuple of (position, length) return an unicode
  87         of the snippet stored there.
  88         """
  89         self.file.seek(pos[0], 0)
  90         txt = self.file.read(pos[1]).decode('utf-8')
  91         return txt
  92
  93     def close(self):
  94         """Close snippet file"""
  95         self.file.close()
  96
  97     def remove(self):
  98         self.revision = None
  99         try:
 100             os.unlink(self.path)
 101             self.revision = 0
 102             while True:
 103                 self.revision += 1
 104                 os.unlink(self.path)
 105         except OSError:
 106             pass
 107
 108
 109 class Index(SolrIndex):
 110     """
 111     Class indexing books.
 112     """
    def __init__(self):
        """Set up the Solr interface through SolrIndex.__init__, without passing a mode."""
        super(Index, self).__init__()
 115
 116     def delete_query(self, *queries):
 117         """
 118         index.delete(queries=...) doesn't work, so let's reimplement it
 119         using deletion of list of uids.
 120         """
 121         uids = set()
 122         for q in queries:
 123             if isinstance(q, sunburnt.search.LuceneQuery):
 124                 q = self.index.query(q)
 125             q.field_limiter.update(['uid'])
 126             st = 0
 127             rows = 100
 128             while True:
 129                 ids = q.paginate(start=st, rows=rows).execute()
 130                 if not len(ids):
 131                     break
 132                 for res in ids:
 133                     uids.add(res['uid'])
 134                 st += rows
 135                 #        print "Will delete %s" % ','.join([x for x in uids])
 136         if uids:
 137             self.index.delete(uids)
 138             return True
 139         else:
 140             return False
 141
 142     def index_tags(self, *tags, **kw):
 143         """
 144         Re-index global tag list.
 145         Removes all tags from index, then index them again.
 146         Indexed fields include: id, name (with and without polish stems), category
 147         """
 148         remove_only = kw.get('remove_only', False)
 149         # first, remove tags from index.
 150         if tags:
 151             tag_qs = []
 152             for tag in tags:
 153                 q_id = self.index.Q(tag_id=tag.id)
 154
 155                 if isinstance(tag, PDCounterAuthor):
 156                     q_cat = self.index.Q(tag_category='pd_author')
 157                 elif isinstance(tag, PDCounterBook):
 158                     q_cat = self.index.Q(tag_category='pd_book')
 159                 else:
 160                     q_cat = self.index.Q(tag_category=tag.category)
 161
 162                 q_id_cat = self.index.Q(q_id & q_cat)
 163                 tag_qs.append(q_id_cat)
 164             self.delete_query(tag_qs)
 165         else:  # all
 166             q = self.index.Q(tag_id__any=True)
 167             self.delete_query(q)
 168
 169         if not remove_only:
 170             # then add them [all or just one passed]
 171             if not tags:
 172                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
 173                     PDCounterAuthor.objects.all(), \
 174                     PDCounterBook.objects.all())
 175
 176             for tag in tags:
 177                 if isinstance(tag, PDCounterAuthor):
 178                     doc = {
 179                         "tag_id": int(tag.id),
 180                         "tag_name": tag.name,
 181                         "tag_name_pl": tag.name,
 182                         "tag_category": 'pd_author',
 183                         "is_pdcounter": True,
 184                         "uid": "tag%d_pd_a" % tag.id
 185                         }
 186                 elif isinstance(tag, PDCounterBook):
 187                     doc = {
 188                         "tag_id": int(tag.id),
 189                         "tag_name": tag.title,
 190                         "tag_name_pl": tag.title,
 191                         "tag_category": 'pd_book',
 192                         "is_pdcounter": True,
 193                         "uid": "tag%d_pd_b" % tag.id
 194                         }
 195                 else:
 196                     doc = {
 197                         "tag_id": int(tag.id),
 198                         "tag_name": tag.name,
 199                         "tag_name_pl": tag.name,
 200                         "tag_category": tag.category,
 201                         "is_pdcounter": False,
 202                         "uid": "tag%d" % tag.id
 203                         }
 204                 self.index.add(doc)
 205
 206     def create_book_doc(self, book):
 207         """
 208         Create a lucene document referring book id.
 209         """
 210         doc = {
 211             'book_id': int(book.id),
 212             }
 213         if book.parent is not None:
 214             doc["parent_id"] = int(book.parent.id)
 215         return doc
 216
 217     def remove_book(self, book_or_id, remove_snippets=True):
 218         """Removes a book from search index.
 219         book - Book instance."""
 220         if isinstance(book_or_id, catalogue.models.Book):
 221             book_id = book_or_id.id
 222         else:
 223             book_id = book_or_id
 224
 225         self.delete_query(self.index.Q(book_id=book_id))
 226
 227         if remove_snippets:
 228             snippets = Snippets(book_id)
 229             snippets.remove()
 230
 231     def index_book(self, book, book_info=None, overwrite=True):
 232         """
 233         Indexes the book.
 234         Creates a lucene document for extracted metadata
 235         and calls self.index_content() to index the contents of the book.
 236         """
 237         if overwrite:
 238             # we don't remove snippets, since they might be still needed by
 239             # threads using not reopened index
 240             self.remove_book(book, remove_snippets=False)
 241
 242         book_doc = self.create_book_doc(book)
 243         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
 244         # let's not index it - it's only used for extracting publish date
 245         if 'source_name' in meta_fields:
 246             del meta_fields['source_name']
 247
 248         for n, f in meta_fields.items():
 249             book_doc[n] = f
 250
 251         book_doc['uid'] = "book%s" % book_doc['book_id']
 252         self.index.add(book_doc)
 253         del book_doc
 254         book_fields = {
 255             'title': meta_fields['title'],
 256             'authors': meta_fields['authors'],
 257             'published_date': meta_fields['published_date']
 258             }
 259         if 'translators' in meta_fields:
 260             book_fields['translators'] = meta_fields['translators']
 261
 262         self.index_content(book, book_fields=book_fields)
 263
 264     master_tags = [
 265         'opowiadanie',
 266         'powiesc',
 267         'dramat_wierszowany_l',
 268         'dramat_wierszowany_lp',
 269         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 270         'wywiad',
 271         ]
 272
 273     ignore_content_tags = [
 274         'uwaga', 'extra',
 275         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 276         'didaskalia',
 277         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 278         ]
 279
 280     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 281
 282     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 283
 284     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 285
 286     def extract_metadata(self, book, book_info=None, dc_only=None):
 287         """
 288         Extract metadata from book and returns a map of fields keyed by fieldname
 289         """
 290         fields = {}
 291
 292         if book_info is None:
 293             book_info = dcparser.parse(open(book.xml_file.path))
 294
 295         fields['slug'] = book.slug
 296         fields['tags'] = [t.name  for t in book.tags]
 297         fields['is_book'] = True
 298
 299         # validator, name
 300         for field in dcparser.BookInfo.FIELDS:
 301             if dc_only and field.name not in dc_only:
 302                 continue
 303             if hasattr(book_info, field.name):
 304                 if not getattr(book_info, field.name):
 305                     continue
 306                 # since no type information is available, we use validator
 307                 type_indicator = field.validator
 308                 if type_indicator == dcparser.as_unicode:
 309                     s = getattr(book_info, field.name)
 310                     if field.multiple:
 311                         s = ', '.join(s)
 312                     fields[field.name] = s
 313                 elif type_indicator == dcparser.as_person:
 314                     p = getattr(book_info, field.name)
 315                     if isinstance(p, dcparser.Person):
 316                         persons = unicode(p)
 317                     else:
 318                         persons = ', '.join(map(unicode, p))
 319                     fields[field.name] = persons
 320                 elif type_indicator == dcparser.as_date:
 321                     dt = getattr(book_info, field.name)
 322                     fields[field.name] = dt
 323
 324         # get published date
 325         pd = None
 326         if hasattr(book_info, 'source_name') and book_info.source_name:
 327             match = self.published_date_re.search(book_info.source_name)
 328             if match is not None:
 329                 pd = str(match.groups()[0])
 330         if not pd: pd = ""
 331         fields["published_date"] = pd
 332
 333         return fields
 334
 335     # def add_gaps(self, fields, fieldname):
 336     #     """
 337     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 338     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 339     #     """
 340     #     def gap():
 341     #         while True:
 342     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 343     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 344
 345     def get_master(self, root):
 346         """
 347         Returns the first master tag from an etree.
 348         """
 349         for master in root.iter():
 350             if master.tag in self.master_tags:
 351                 return master
 352
 353     def index_content(self, book, book_fields={}):
 354         """
 355         Walks the book XML and extract content from it.
 356         Adds parts for each header tag and for each fragment.
 357         """
 358         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 359         root = wld.edoc.getroot()
 360
 361         master = self.get_master(root)
 362         if master is None:
 363             return []
 364
 365         def walker(node, ignore_tags=[]):
 366
 367             if node.tag not in ignore_tags:
 368                 yield node, None, None
 369                 if node.text is not None:
 370                     yield None, node.text, None
 371                 for child in list(node):
 372                     for b, t, e in walker(child):
 373                         yield b, t, e
 374                 yield None, None, node
 375
 376             if node.tail is not None:
 377                 yield None, node.tail, None
 378             return
 379
 380         def fix_format(text):
 381             #            separator = [u" ", u"\t", u".", u";", u","]
 382             if isinstance(text, list):
 383                 # need to join it first
 384                 text = filter(lambda s: s is not None, content)
 385                 text = u' '.join(text)
 386                 # for i in range(len(text)):
 387                 #     if i > 0:
 388                 #         if text[i][0] not in separator\
 389                 #             and text[i - 1][-1] not in separator:
 390                 #          text.insert(i, u" ")
 391
 392             return re.sub("(?m)/$", "", text)
 393
 394         def add_part(snippets, **fields):
 395             doc = self.create_book_doc(book)
 396             for n, v in book_fields.items():
 397                 doc[n] = v
 398
 399             doc['header_index'] = fields["header_index"]
 400             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 401             doc['header_type'] = fields['header_type']
 402
 403             doc['text'] = fields['text']
 404
 405             # snippets
 406             snip_pos = snippets.add(fields["text"])
 407
 408             doc['snippets_position'] = snip_pos[0]
 409             doc['snippets_length'] = snip_pos[1]
 410             if snippets.revision:
 411                 doc["snippets_revision"] = snippets.revision
 412
 413             if 'fragment_anchor' in fields:
 414                 doc["fragment_anchor"] = fields['fragment_anchor']
 415
 416             if 'themes' in fields:
 417                 doc['themes'] = fields['themes']
 418             doc['uid'] = "part%s%s%s" % (doc['header_index'],
 419                                          doc['header_span'],
 420                                          doc.get('fragment_anchor', ''))
 421             return doc
 422
 423         def give_me_utf8(s):
 424             if isinstance(s, unicode):
 425                 return s.encode('utf-8')
 426             else:
 427                 return s
 428
 429         fragments = {}
 430         snippets = Snippets(book.id).open('w')
 431         try:
 432             for header, position in zip(list(master), range(len(master))):
 433
 434                 if header.tag in self.skip_header_tags:
 435                     continue
 436                 if header.tag is etree.Comment:
 437                     continue
 438
 439                 # section content
 440                 content = []
 441                 footnote = []
 442
 443                 def all_content(text):
 444                     for frag in fragments.values():
 445                         frag['text'].append(text)
 446                     content.append(text)
 447                 handle_text = [all_content]
 448
 449                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
 450                     # handle footnotes
 451                     if start is not None and start.tag in self.footnote_tags:
 452                         footnote = []
 453
 454                         def collect_footnote(t):
 455                             footnote.append(t)
 456
 457                         handle_text.append(collect_footnote)
 458                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 459                         handle_text.pop()
 460                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 461                                        text=u''.join(footnote),
 462                                        is_footnote=True)
 463
 464                         self.index.add(doc)
 465                         #print "@ footnote text: %s" % footnote
 466                         footnote = []
 467
 468                     # handle fragments and themes.
 469                     if start is not None and start.tag == 'begin':
 470                         fid = start.attrib['id'][1:]
 471                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 472
 473                     # themes for this fragment
 474                     elif start is not None and start.tag == 'motyw':
 475                         fid = start.attrib['id'][1:]
 476                         handle_text.append(None)
 477                         if start.text is not None:
 478                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 479                     elif end is not None and end.tag == 'motyw':
 480                         handle_text.pop()
 481
 482                     elif start is not None and start.tag == 'end':
 483                         fid = start.attrib['id'][1:]
 484                         if fid not in fragments:
 485                             continue  # a broken <end> node, skip it
 486                         frag = fragments[fid]
 487                         if frag['themes'] == []:
 488                             continue  # empty themes list.
 489                         del fragments[fid]
 490
 491                         doc = add_part(snippets,
 492                                        header_type=frag['start_header'],
 493                                        header_index=frag['start_section'],
 494                                        header_span=position - frag['start_section'] + 1,
 495                                        fragment_anchor=fid,
 496                                        text=fix_format(frag['text']),
 497                                        themes=frag['themes'])
 498                         #print '@ FRAG %s' % frag['content']
 499                         self.index.add(doc)
 500
 501                         # Collect content.
 502
 503                     if text is not None and handle_text is not []:
 504                         hdl = handle_text[-1]
 505                         if hdl is not None:
 506                             hdl(text)
 507
 508                         # in the end, add a section text.
 509                 doc = add_part(snippets, header_index=position,
 510                                header_type=header.tag, text=fix_format(content))
 511                 #print '@ CONTENT: %s' % fix_format(content)
 512
 513                 self.index.add(doc)
 514
 515         finally:
 516             snippets.close()
 517
 518
 519 class SearchResult(object):
 520     def __init__(self, doc, how_found=None, query=None, query_terms=None):
 521         #        self.search = search
 522         self.boost = 1.0
 523         self._hits = []
 524         self._processed_hits = None  # processed hits
 525         self.snippets = []
 526         self.query_terms = query_terms
 527
 528         if 'score' in doc:
 529             self._score = doc['score']
 530         else:
 531             self._score = 0
 532
 533         self.book_id = int(doc["book_id"])
 534
 535         try:
 536             self.published_date = int(doc.get("published_date"))
 537         except ValueError:
 538             self.published_date = 0
 539
 540         # content hits
 541         header_type = doc.get("header_type", None)
 542         # we have a content hit in some header of fragment
 543         if header_type is not None:
 544             sec = (header_type, int(doc["header_index"]))
 545             header_span = doc['header_span']
 546             header_span = header_span is not None and int(header_span) or 1
 547             fragment = doc.get("fragment_anchor", None)
 548             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 549             snippets_rev = doc['snippets_revision']
 550
 551             hit = (sec + (header_span,), fragment, self._score, {
 552                 'how_found': how_found,
 553                 'snippets_pos': snippets_pos,
 554                 'snippets_revision': snippets_rev,
 555                 'themes': doc.get('themes', []),
 556                 'themes_pl': doc.get('themes_pl', [])
 557                 })
 558
 559             self._hits.append(hit)
 560
 561     def __unicode__(self):
 562         return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
 563             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 564
 565     def __str__(self):
 566         return unicode(self).encode('utf-8')
 567
 568     @property
 569     def score(self):
 570         return self._score * self.boost
 571
 572     def merge(self, other):
 573         if self.book_id != other.book_id:
 574             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
 575         self._hits += other._hits
 576         if other.score > self.score:
 577             self._score = other._score
 578         return self
 579
 580     def get_book(self):
 581         if hasattr(self, '_book'):
 582             return self._book
 583         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 584         return self._book
 585
 586     book = property(get_book)
 587
 588     POSITION = 0
 589     FRAGMENT = 1
 590     POSITION_INDEX = 1
 591     POSITION_SPAN = 2
 592     SCORE = 2
 593     OTHER = 3
 594
 595     @property
 596     def hits(self):
 597         if self._processed_hits is not None:
 598             return self._processed_hits
 599
 600         # to sections and fragments
 601         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 602
 603         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 604
 605         # sections not covered by fragments
 606         sect = filter(lambda s: 0 == len(filter(
 607             lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
 608             and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
 609             frags)), sect)
 610
 611         hits = []
 612
 613         def remove_duplicates(lst, keyfn, compare):
 614             els = {}
 615             for e in lst:
 616                 eif = keyfn(e)
 617                 if eif in els:
 618                     if compare(els[eif], e) >= 1:
 619                         continue
 620                 els[eif] = e
 621             return els.values()
 622
 623         # remove fragments with duplicated fid's and duplicated snippets
 624         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 625         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 626         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 627
 628         # remove duplicate sections
 629         sections = {}
 630
 631         for s in sect:
 632             si = s[self.POSITION][self.POSITION_INDEX]
 633             # skip existing
 634             if si in sections:
 635                 if sections[si]['score'] >= s[self.SCORE]:
 636                     continue
 637
 638             m = {'score': s[self.SCORE],
 639                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 640                  }
 641             m.update(s[self.OTHER])
 642             sections[si] = m
 643
 644         hits = sections.values()
 645
 646         for f in frags:
 647             try:
 648                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 649             except catalogue.models.Fragment.DoesNotExist:
 650                 # stale index
 651                 continue
 652             print f
 653             # Figure out if we were searching for a token matching some word in theme name.
 654             themes = frag.tags.filter(category='theme')
 655             themes_hit = set()
 656             if self.query_terms is not None:
 657                 for i in range(0, len(f[self.OTHER]['themes'])):
 658                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 659                     tms = map(unicode.lower, tms)
 660                     for qt in self.query_terms:
 661                         if qt in tms:
 662                             themes_hit.add(f[self.OTHER]['themes'][i])
 663                             break
 664
 665             def theme_by_name(n):
 666                 th = filter(lambda t: t.name == n, themes)
 667                 if th:
 668                     return th[0]
 669                 else:
 670                     return None
 671             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 672
 673             m = {'score': f[self.SCORE],
 674                  'fragment': frag,
 675                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 676                  'themes': themes,
 677                  'themes_hit': themes_hit
 678                  }
 679             m.update(f[self.OTHER])
 680             hits.append(m)
 681
 682         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
 683
 684         self._processed_hits = hits
 685
 686         return hits
 687
 688     @staticmethod
 689     def aggregate(*result_lists):
 690         books = {}
 691         for rl in result_lists:
 692             for r in rl:
 693                 if r.book_id in books:
 694                     books[r.book_id].merge(r)
 695                 else:
 696                     books[r.book_id] = r
 697         return books.values()
 698
 699     def __cmp__(self, other):
 700         c = cmp(self.score, other.score)
 701         if c == 0:
 702             # this is inverted, because earlier date is better
 703             return cmp(other.published_date, self.published_date)
 704         else:
 705             return c
 706
 707     def __len__(self):
 708         return len(self.hits)
 709
 710     def snippet_pos(self, idx=0):
 711         return self.hits[idx]['snippets_pos']
 712
 713     def snippet_revision(self, idx=0):
 714         try:
 715             return self.hits[idx]['snippets_revision']
 716         except:
 717             return None
 718
 719
 720 class Search(SolrIndex):
 721     """
 722     Search facilities.
 723     """
    def __init__(self, default_field="text"):
        """
        Set up the Solr interface through SolrIndex.__init__.
        default_field is accepted, but it is not used by this method.
        """
        super(Search, self).__init__()
 726
 727     # def get_tokens(self, searched, field='text', cached=None):
 728     #     """returns tokens analyzed by a proper (for a field) analyzer
 729     #     argument can be: StringReader, string/unicode, or tokens. In the last case
 730     #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
 731     #     """
 732     #     if cached is not None and field in cached:
 733     #         return cached[field]
 734
 735     #     if isinstance(searched, str) or isinstance(searched, unicode):
 736     #         searched = StringReader(searched)
 737     #     elif isinstance(searched, list):
 738     #         return searched
 739
 740     #     searched.reset()
 741     #     tokens = self.analyzer.reusableTokenStream(field, searched)
 742     #     toks = []
 743     #     while tokens.incrementToken():
 744     #         cta = tokens.getAttribute(CharTermAttribute.class_)
 745     #         toks.append(cta.toString())
 746
 747     #     if cached is not None:
 748     #         cached[field] = toks
 749
 750     #     return toks
 751
 752     # @staticmethod
 753     # def fuzziness(fuzzy):
 754     #     """Helper method to sanitize fuzziness"""
 755     #     if not fuzzy:
 756     #         return None
 757     #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
 758     #         return fuzzy
 759     #     else:
 760     #         return 0.5
 761
 762     # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
 763     #     """
 764     #     Return a PhraseQuery with a series of tokens.
 765     #     """
 766     #     if fuzzy:
 767     #         phrase = MultiPhraseQuery()
 768     #         for t in tokens:
 769     #             term = Term(field, t)
 770     #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
 771     #             fuzzterms = []
 772
 773     #             while True:
 774     #                 ft = fuzzterm.term()
 775     #                 if ft:
 776     #                     fuzzterms.append(ft)
 777     #                 if not fuzzterm.next(): break
 778     #             if fuzzterms:
 779     #                 phrase.add(JArray('object')(fuzzterms, Term))
 780     #             else:
 781     #                 phrase.add(term)
 782     #     else:
 783     #         phrase = PhraseQuery()
 784     #         phrase.setSlop(slop)
 785     #         for t in tokens:
 786     #             term = Term(field, t)
 787     #             phrase.add(term)
 788     #     return phrase
 789
 790     def make_term_query(self, query, field='text', modal=operator.or_):
 791         """
 792         Returns term queries joined by boolean query.
 793         modal - applies to boolean query
 794         fuzzy - should the query by fuzzy.
 795         """
 796         q = self.index.Q()
 797         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
 798                         query.split(r" ")), q)
 799
 800         return q
 801
 802     def search_phrase(self, searched, field='text', book=False,
 803                       filters=None,
 804                       snippets=False):
 805         if filters is None: filters = []
 806         if book: filters.append(self.index.Q(is_book=True))
 807
 808         q = self.index.query(**{field: searched})
 809         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
 810         res = q.execute()
 811         return [SearchResult(found, how_found=u'search_phrase') for found in res]
 812
 813     def search_some(self, searched, fields, book=True,
 814                     filters=None, snippets=True, query_terms=None):
 815         assert isinstance(fields, list)
 816         if filters is None: filters = []
 817         if book: filters.append(self.index.Q(is_book=True))
 818
 819         query = self.index.Q()
 820
 821         for fld in fields:
 822             query = self.index.Q(query | self.make_term_query(searched, fld))
 823
 824         query = self.index.query(query)
 825         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 826         res = query.execute()
 827         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 828
 829     # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
 830     #     """
 831     #     Search for perfect book matches. Just see if the query matches with some author or title,
 832     #     taking hints into account.
 833     #     """
 834     #     fields_to_search = ['authors', 'title']
 835     #     only_in = None
 836     #     if hint:
 837     #         if not hint.should_search_for_book():
 838     #             return []
 839     #         fields_to_search = hint.just_search_in(fields_to_search)
 840     #         only_in = hint.book_filter()
 841
 842     #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
 843
 844     #     books = []
 845     #     for q in qrys:
 846     #         top = self.searcher.search(q,
 847     #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 848     #             max_results)
 849     #         for found in top.scoreDocs:
 850     #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
 851     #     return books
 852
 853     # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
 854     #     fields_to_search = ['tags', 'authors', 'title']
 855
 856     #     only_in = None
 857     #     if hint:
 858     #         if not hint.should_search_for_book():
 859     #             return []
 860     #         fields_to_search = hint.just_search_in(fields_to_search)
 861     #         only_in = hint.book_filter()
 862
 863     #     tokens = self.get_tokens(searched, field='SIMPLE')
 864
 865     #     q = BooleanQuery()
 866
 867     #     for fld in fields_to_search:
 868     #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
 869     #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
 870
 871     #     books = []
 872     #     top = self.searcher.search(q,
 873     #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
 874     #         max_results)
 875     #     for found in top.scoreDocs:
 876     #         books.append(SearchResult(self, found, how_found="search_book"))
 877
 878     #     return books
 879
 880     # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
 881     #     """
 882     #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
 883     #     some part/fragment of the book.
 884     #     """
 885     #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
 886
 887     #     flt = None
 888     #     if hint:
 889     #         flt = hint.part_filter()
 890
 891     #     books = []
 892     #     for q in qrys:
 893     #         top = self.searcher.search(q,
 894     #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
 895     #                                                        flt]),
 896     #                                    max_results)
 897     #         for found in top.scoreDocs:
 898     #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
 899
 900     #     return books
 901
 902     def search_everywhere(self, searched, query_terms=None):
 903         """
 904         Tries to use search terms to match different fields of book (or its parts).
 905         E.g. one word can be an author survey, another be a part of the title, and the rest
 906         are some words from third chapter.
 907         """
 908         books = []
 909         # content only query : themes x content
 910         q = self.make_term_query(searched, 'text')
 911         q_themes = self.make_term_query(searched, 'themes_pl')
 912
 913         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
 914         res = query.execute()
 915
 916         for found in res:
 917             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
 918
 919         # query themes/content x author/title/tags
 920         in_content = self.index.Q()
 921         in_meta = self.index.Q()
 922
 923         for fld in ['themes_pl', 'text']:
 924             in_content |= self.make_term_query(searched, field=fld)
 925
 926         for fld in ['tags', 'authors', 'title']:
 927             in_meta |= self.make_term_query(searched, field=fld)
 928
 929         q = in_content & in_meta
 930         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
 931
 932         for found in res:
 933             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
 934
 935         return books
 936
 937     def get_snippets(self, searchresult, query, field='text', num=1):
 938         """
 939         Returns a snippet for found scoreDoc.
 940         """
 941         maxnum = len(searchresult)
 942         if num is None or num < 0 or num > maxnum:
 943             num = maxnum
 944         book_id = searchresult.book_id
 945         revision = searchresult.snippet_revision()
 946         snippets = Snippets(book_id, revision=revision)
 947         snips = [None] * maxnum
 948         try:
 949             snippets.open()
 950             idx = 0
 951             while idx < maxnum and num > 0:
 952                 position, length = searchresult.snippet_pos(idx)
 953                 if position is None or length is None:
 954                     continue
 955                 text = snippets.get((int(position),
 956                                      int(length)))
 957                 print "== %s -- %s ==" % (query, text)
 958                 snip = self.index.highlight(text=text, field=field, q=query)
 959                 snips[idx] = snip
 960                 if snip:
 961                     num -= 1
 962                 idx += 1
 963
 964         except IOError, e:
 965             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
 966             return []
 967         finally:
 968             snippets.close()
 969
 970             # remove verse end markers..
 971         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 972
 973         searchresult.snippets = snips
 974         return snips
 975
 976     def hint_tags(self, query, pdcounter=True, prefix=True):
 977         """
 978         Return auto-complete hints for tags
 979         using prefix search.
 980         """
 981         q = self.index.Q()
 982         query = query.strip()
 983         for field in ['tag_name', 'tag_name_pl']:
 984             if prefix:
 985                 q |= self.index.Q(**{field: query + "*"})
 986             else:
 987                 q |= self.make_term_query(query, field=field)
 988         qu = self.index.query(q).exclude(tag_category="book")
 989
 990         return self.search_tags(qu, pdcounter=pdcounter)
 991
 992     def search_tags(self, query, filters=None, pdcounter=False):
 993         """
 994         Search for Tag objects using query.
 995         """
 996         if not filters: filters = []
 997         if not pdcounter:
 998             filters.append(~self.index.Q(is_pdcounter=True))
 999         res = self.apply_filters(query, filters).execute()
1000
1001         tags = []
1002         for doc in res:
1003             is_pdcounter = doc.get('is_pdcounter', False)
1004             category = doc.get('tag_category')
1005             try:
1006                 if is_pdcounter == True:
1007                     if category == 'pd_author':
1008                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1009                     elif category == 'pd_book':
1010                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1011                         tag.category = 'pd_book'  # make it look more lik a tag.
1012                     else:
1013                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1014                 else:
1015                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1016                     # don't add the pdcounter tag if same tag already exists
1017
1018                 tags.append(tag)
1019
1020             except catalogue.models.Tag.DoesNotExist: pass
1021             except PDCounterAuthor.DoesNotExist: pass
1022             except PDCounterBook.DoesNotExist: pass
1023
1024         log.debug('search_tags: %s' % tags)
1025
1026         return tags
1027
1028     def hint_books(self, query, prefix=True):
1029         """
1030         Returns auto-complete hints for book titles
1031         Because we do not index 'pseudo' title-tags.
1032         Prefix search.
1033         """
1034         q = self.index.Q()
1035         query = query.strip()
1036         if prefix:
1037             q |= self.index.Q(title=query + "*")
1038         else:
1039             q |= self.make_term_query(query, field='title')
1040         qu = self.index.query(q)
1041         only_books = self.index.Q(is_book=True)
1042         return self.search_books(qu, [only_books])
1043
1044     def search_books(self, query, filters=None, max_results=10):
1045         """
1046         Searches for Book objects using query
1047         """
1048         bks = []
1049         res = self.apply_filters(query, filters).field_limit(['book_id'])
1050         for r in res:
1051             try:
1052                 bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
1053             except catalogue.models.Book.DoesNotExist: pass
1054         return bks
1055
1056     # def make_prefix_phrase(self, toks, field):
1057     #     q = MultiPhraseQuery()
1058     #     for i in range(len(toks)):
1059     #         t = Term(field, toks[i])
1060     #         if i == len(toks) - 1:
1061     #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1062     #             if pterms:
1063     #                 q.add(pterms)
1064     #             else:
1065     #                 q.add(t)
1066     #         else:
1067     #             q.add(t)
1068     #     return q
1069
1070     # @staticmethod
1071     # def term_filter(term, inverse=False):
1072     #     only_term = TermsFilter()
1073     #     only_term.addTerm(term)
1074
1075     #     if inverse:
1076     #         neg = BooleanFilter()
1077     #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1078     #         only_term = neg
1079
1080     #     return only_term
1081
1082
1083
1084     @staticmethod
1085     def apply_filters(query, filters):
1086         """
1087         Apply filters to a query
1088         """
1089         if filters is None: filters = []
1090         filters = filter(lambda x: x is not None, filters)
1091         for f in filters:
1092             query = query.query(f)
1093         return query
1094
1095     # def filtered_categories(self, tags):
1096     #     """
1097     #     Return a list of tag categories, present in tags list.
1098     #     """
1099     #     cats = {}
1100     #     for t in tags:
1101     #         cats[t.category] = True
1102     #     return cats.keys()
1103
1104     # def hint(self):
1105     #     return Hint(self)