[wolnelektury.git] / apps / search / index.py

# -*- coding: utf-8 -*-

import errno
import logging
import operator
import os
import re
import traceback
from itertools import chain

from django.conf import settings
from lxml import etree
import sunburnt

from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
import custom

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    Manages snippet files for an indexed object (a book).
    The snippets are concatenated together in one file per book;
    their positions and lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # never overwrite an existing snippet file: bump the revision
            # until we find an unused filename
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there, as unicode.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        """Remove the snippet file, including any revisioned variants."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass


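# A minimal usage sketch for Snippets (assuming settings.SEARCH_INDEX points
# at a writable directory; the sample text and book id are arbitrary):
#
#     snips = Snippets(book_id=1).open('w')
#     pos = snips.add(u"Litwo! Ojczyzno moja!")  # -> (0, 21)
#     snips.close()
#     snips.open('r')
#     assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
#     snips.close()

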
class Index(SolrIndex):
    """
    Indexes books and their content.
    """
    def __init__(self):
        super(Index, self).__init__()

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

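    # For example, this removes every tag document from the index
    # (as index_tags below does when called with no arguments):
    #   self.delete_query(self.index.Q(tag_id__any=True))
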
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)
                log.debug("%s %s", doc['tag_name'], doc['tag_category'])

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Remove a book from the search index.
        book_or_id - Book instance or its database id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed
            # by threads using an index that hasn't been reopened yet
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
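    # A quick illustration of the pattern above (the sample source_name is
    # hypothetical; the regexp picks up a trailing publication year):
    #   published_date_re.search(u"Czytelnik, Warszawa 1990.").group(1) == "1990"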

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a dict of fields
        keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walk the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        if book_fields is None:
            book_fields = {}
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # yield (start, text, end) events while walking the subtree
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # pass ignore_tags down, so ignored tags are skipped
                    # at any depth, not just at the top level
                    for b, t, e in walker(child, ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

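        # The walker yields triples in document order; for <a>x<b>y</b>z</a>
        # it would produce, roughly:
        #   (a, None, None), (None, 'x', None), (b, None, None),
        #   (None, 'y', None), (None, None, b), (None, 'z', None),
        #   (None, None, a)
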
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)

                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision')

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into the hit tuples stored in self._hits
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

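    # For reference, a single hit tuple (as built in __init__) looks like:
    #   ((header_type, header_index, header_span),  # POSITION
    #    fragment_anchor_or_None,                   # FRAGMENT
    #    score,                                     # SCORE
    #    {'how_found': ..., 'snippets_pos': ...,    # OTHER
    #     'snippets_revision': ...})
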
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split hits into fragment hits and section hits
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # leave only sections not covered by any fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing section unless the new score is better
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            # if self.searched is not None:
            #     tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
            #     for theme in themes:
            #         name_tokens = self.search.get_tokens(theme.name, 'POLISH')
            #         for t in tokens:
            #             if t in name_tokens:
            #                 if not theme in themes_hit:
            #                     themes_hit.append(theme)
            #                 break

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

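    # Usage sketch (the two result lists are hypothetical outputs of
    # different queries over the same corpus):
    #   results = SearchResult.aggregate(phrase_results, everywhere_results)
    #   results.sort(reverse=True)  # best score first, via __cmp__ below
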
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__()

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Return single-term queries for each word of `query`,
        joined with the boolean operator given as `modal`.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split()), q)

        return q

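    # For example (a sketch; the query string is arbitrary):
    #   make_term_query(u"ala ma kota")
    # builds roughly Q() | Q(text=u'ala') | Q(text=u'ma') | Q(text=u'kota');
    # with modal=operator.and_ the terms are AND-ed instead.
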
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None,
                    snippets=True):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some') for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched):
        """
        Try to use the search terms to match different fields of the book
        (or its parts). E.g. one word can match an author's name, another
        a part of the title, and the rest some words from the content.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere'))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Return highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1  # don't loop forever on a hit without a snippet
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                log.debug("== %s -- %s ==" % (query, text))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError, e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags,
        using a prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

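    # Usage sketch (a hypothetical prefix typed into an autocomplete box):
    #   Search().hint_tags(u"mick")  # tags whose names start with "mick"
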
    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Return auto-complete hints for book titles, as a prefix search.
        (Needed because we do not index 'pseudo' title-tags.)
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Search for Book objects using query;
        return at most max_results books.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res.paginate(rows=max_results).execute():
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)