# profiling
# [wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4
5 import os
6 import re
7 import errno
8 from librarian import dcparser
9 from librarian.parser import WLDocument
10 from lxml import etree
11 import catalogue.models
12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
13 from itertools import chain
14 import traceback
15 import logging
16 log = logging.getLogger('search')
17 import sunburnt
18 import custom
19 import operator
20
21 log = logging.getLogger('search')
22
class SolrIndex(object):
    """Thin base class: wires a configured Solr interface onto subclasses.

    Subclasses pick the access mode ('r' for searching, 'rw' for indexing)
    and use it through ``self.index``.
    """

    def __init__(self, mode=None):
        interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = interface
26
27
class Snippets(object):
    """
    This class manages snippet files for indexed object (book)
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists; tolerate the
        # "already exists" race, re-raise any other OS error.
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        """Filesystem path of the snippet file for this book/revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.

        Snippet files are always binary.  When opening for writing and a
        file already exists, a fresh revision number is picked so readers
        of the old file are not disturbed.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                # find the first unused revision number
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close snippet file"""
        self.file.close()

    def remove(self):
        """Remove the snippet file and all its revisions (best effort)."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # stop at the first missing revision; nothing more to delete
            pass
108
109
class Index(SolrIndex):
    """
    Class indexing books: their metadata, content (sections, footnotes)
    and fragments with themes, plus the global tag list.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Collects the `uid` of every document matching any of `queries`
        (fetched 100 rows at a time) and deletes them all in one call.
        Returns True if anything was deleted, False otherwise.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category

        Keyword args:
            remove_only -- if True, only remove tags, do not re-add them.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                # PDCounter entries use dedicated categories, so their ids
                # cannot clash with regular catalogue tags.
                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            # delete_query() takes queries as *args, not a single list;
            # passing the bare list crashed on q.field_limiter.
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """
        Removes a book from search index.
        book_or_id -- Book instance or a plain book id.
        remove_snippets -- also delete the on-disk snippet files.
        """
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        # fields repeated on every part document of this book
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    # Master tags: the root elements whose content makes up a book body.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # Elements whose own text is not indexed as content.
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # Footnote elements, indexed separately with is_footnote=True.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level elements skipped entirely while walking the master tag.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing number in the source_name field, e.g. "... [1884]".
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.

        dc_only -- if given, restrict Dublin Core extraction to these fields.
        """
        fields = {}

        if book_info is None:
            # parse eagerly and close the file instead of leaking the handle
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date (last number in source_name, if any)
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        if book_fields is None:
            book_fields = {}

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=None):
            """
            Depth-first walk yielding (start_node, text, end_node) triples,
            exactly one of which is not None per yield.
            NOTE(review): ignore_tags is only checked on the node walker is
            called with, not on descendants (recursion drops it) -- confirm
            this is intentional before changing.
            """
            if ignore_tags is None:
                ignore_tags = []

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            """Join text chunks and strip verse-continuation slashes."""
            if isinstance(text, list):
                # need to join it first; this used to (wrongly) join the
                # enclosing `content` list instead of the argument.
                text = u' '.join(filter(lambda s: s is not None, text))
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            """Build a Solr document for one part (section or fragment)."""
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # store the text in the snippet file and keep its location
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    # default handler: text goes to the section and to
                    # every currently-open fragment
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        # (the old `footnote is not []` guard compared
                        # identity with a fresh list -- always true -- so
                        # it is dropped without changing behavior)
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # a None handler suppresses the theme-name text
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content through the topmost text handler
                    if text is not None:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
519
520
class SearchResult(object):
    """
    One book hit from the search index, with the sections/fragments
    inside the book where the match occurred.
    """
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        # published_date may be absent (int(None) -> TypeError) or
        # malformed (ValueError); fall back to 0 in both cases.
        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc['snippets_revision']

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        """Raw index score multiplied by the (mutable) boost factor."""
        return self._score * self.boost

    def merge(self, other):
        """Fold another result for the same book into this one."""
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        # lazily fetched and cached Book instance
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into the hit tuples stored in self._hits.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """
        Processed hits: a list of dicts (score, section_number, themes, ...)
        for sections and fragments, computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        hits = []

        def remove_duplicates(lst, keyfn, compare):
            # keep, per key, the element winning the `compare` ordering
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # split theme names into words; str.split(r' +') split
                    # on the literal string " +" (never present), so a real
                    # regex split is used instead.
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results from several lists, one SearchResult per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        # best effort: missing hit or missing key means "no revision"
        try:
            return self.hits[idx]['snippets_revision']
        except Exception:
            return None
719
720
721 class Search(SolrIndex):
722     """
723     Search facilities.
724     """
    def __init__(self, default_field="text"):
        # Open the Solr index read-only.
        # NOTE(review): `default_field` is accepted but never used in this
        # method -- confirm whether callers rely on passing it before
        # removing the parameter.
        super(Search, self).__init__(mode='r')
727
728     # def get_tokens(self, searched, field='text', cached=None):
729     #     """returns tokens analyzed by a proper (for a field) analyzer
730     #     argument can be: StringReader, string/unicode, or tokens. In the last case
731     #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
732     #     """
733     #     if cached is not None and field in cached:
734     #         return cached[field]
735
736     #     if isinstance(searched, str) or isinstance(searched, unicode):
737     #         searched = StringReader(searched)
738     #     elif isinstance(searched, list):
739     #         return searched
740
741     #     searched.reset()
742     #     tokens = self.analyzer.reusableTokenStream(field, searched)
743     #     toks = []
744     #     while tokens.incrementToken():
745     #         cta = tokens.getAttribute(CharTermAttribute.class_)
746     #         toks.append(cta.toString())
747
748     #     if cached is not None:
749     #         cached[field] = toks
750
751     #     return toks
752
753     # @staticmethod
754     # def fuzziness(fuzzy):
755     #     """Helper method to sanitize fuzziness"""
756     #     if not fuzzy:
757     #         return None
758     #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
759     #         return fuzzy
760     #     else:
761     #         return 0.5
762
763     # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
764     #     """
765     #     Return a PhraseQuery with a series of tokens.
766     #     """
767     #     if fuzzy:
768     #         phrase = MultiPhraseQuery()
769     #         for t in tokens:
770     #             term = Term(field, t)
771     #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
772     #             fuzzterms = []
773
774     #             while True:
775     #                 ft = fuzzterm.term()
776     #                 if ft:
777     #                     fuzzterms.append(ft)
778     #                 if not fuzzterm.next(): break
779     #             if fuzzterms:
780     #                 phrase.add(JArray('object')(fuzzterms, Term))
781     #             else:
782     #                 phrase.add(term)
783     #     else:
784     #         phrase = PhraseQuery()
785     #         phrase.setSlop(slop)
786     #         for t in tokens:
787     #             term = Term(field, t)
788     #             phrase.add(term)
789     #     return phrase
790
791     def make_term_query(self, query, field='text', modal=operator.or_):
792         """
793         Returns term queries joined by boolean query.
794         modal - applies to boolean query
795         fuzzy - should the query by fuzzy.
796         """
797         if query is None: query = ''
798         q = self.index.Q()
799         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
800                         query.split(r" ")), q)
801
802         return q
803
804     def search_phrase(self, searched, field='text', book=False,
805                       filters=None,
806                       snippets=False):
807         if filters is None: filters = []
808         if book: filters.append(self.index.Q(is_book=True))
809
810         q = self.index.query(**{field: searched})
811         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
812         res = q.execute()
813         return [SearchResult(found, how_found=u'search_phrase') for found in res]
814
815     def search_some(self, searched, fields, book=True,
816                     filters=None, snippets=True, query_terms=None):
817         assert isinstance(fields, list)
818         if filters is None: filters = []
819         if book: filters.append(self.index.Q(is_book=True))
820
821         query = self.index.Q()
822
823         for fld in fields:
824             query = self.index.Q(query | self.make_term_query(searched, fld))
825
826         query = self.index.query(query)
827         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
828         res = query.execute()
829         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
830
831     # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
832     #     """
833     #     Search for perfect book matches. Just see if the query matches with some author or title,
834     #     taking hints into account.
835     #     """
836     #     fields_to_search = ['authors', 'title']
837     #     only_in = None
838     #     if hint:
839     #         if not hint.should_search_for_book():
840     #             return []
841     #         fields_to_search = hint.just_search_in(fields_to_search)
842     #         only_in = hint.book_filter()
843
844     #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
845
846     #     books = []
847     #     for q in qrys:
848     #         top = self.searcher.search(q,
849     #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
850     #             max_results)
851     #         for found in top.scoreDocs:
852     #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
853     #     return books
854
855     # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
856     #     fields_to_search = ['tags', 'authors', 'title']
857
858     #     only_in = None
859     #     if hint:
860     #         if not hint.should_search_for_book():
861     #             return []
862     #         fields_to_search = hint.just_search_in(fields_to_search)
863     #         only_in = hint.book_filter()
864
865     #     tokens = self.get_tokens(searched, field='SIMPLE')
866
867     #     q = BooleanQuery()
868
869     #     for fld in fields_to_search:
870     #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
871     #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
872
873     #     books = []
874     #     top = self.searcher.search(q,
875     #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
876     #         max_results)
877     #     for found in top.scoreDocs:
878     #         books.append(SearchResult(self, found, how_found="search_book"))
879
880     #     return books
881
882     # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
883     #     """
884     #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
885     #     some part/fragment of the book.
886     #     """
887     #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
888
889     #     flt = None
890     #     if hint:
891     #         flt = hint.part_filter()
892
893     #     books = []
894     #     for q in qrys:
895     #         top = self.searcher.search(q,
896     #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
897     #                                                        flt]),
898     #                                    max_results)
899     #         for found in top.scoreDocs:
900     #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
901
902     #     return books
903
904     def search_everywhere(self, searched, query_terms=None):
905         """
906         Tries to use search terms to match different fields of book (or its parts).
907         E.g. one word can be an author survey, another be a part of the title, and the rest
908         are some words from third chapter.
909         """
910         books = []
911         # content only query : themes x content
912         q = self.make_term_query(searched, 'text')
913         q_themes = self.make_term_query(searched, 'themes_pl')
914
915         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
916         res = query.execute()
917
918         for found in res:
919             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
920
921         # query themes/content x author/title/tags
922         in_content = self.index.Q()
923         in_meta = self.index.Q()
924
925         for fld in ['themes_pl', 'text']:
926             in_content |= self.make_term_query(searched, field=fld)
927
928         for fld in ['tags', 'authors', 'title']:
929             in_meta |= self.make_term_query(searched, field=fld)
930
931         q = in_content & in_meta
932         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
933
934         for found in res:
935             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
936
937         return books
938
939     def get_snippets(self, searchresult, query, field='text', num=1):
940         """
941         Returns a snippet for found scoreDoc.
942         """
943         maxnum = len(searchresult)
944         if num is None or num < 0 or num > maxnum:
945             num = maxnum
946         book_id = searchresult.book_id
947         revision = searchresult.snippet_revision()
948         snippets = Snippets(book_id, revision=revision)
949         snips = [None] * maxnum
950         try:
951             snippets.open()
952             idx = 0
953             while idx < maxnum and num > 0:
954                 position, length = searchresult.snippet_pos(idx)
955                 if position is None or length is None:
956                     continue
957                 text = snippets.get((int(position),
958                                      int(length)))
959                 snip = self.index.highlight(text=text, field=field, q=query)
960                 snips[idx] = snip
961                 if snip:
962                     num -= 1
963                 idx += 1
964
965         except IOError, e:
966             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
967             return []
968         finally:
969             snippets.close()
970
971             # remove verse end markers..
972         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
973
974         searchresult.snippets = snips
975
976         return snips
977
978     def hint_tags(self, query, pdcounter=True, prefix=True):
979         """
980         Return auto-complete hints for tags
981         using prefix search.
982         """
983         q = self.index.Q()
984         query = query.strip()
985         for field in ['tag_name', 'tag_name_pl']:
986             if prefix:
987                 q |= self.index.Q(**{field: query + "*"})
988             else:
989                 q |= self.make_term_query(query, field=field)
990         qu = self.index.query(q).exclude(tag_category="book")
991
992         return self.search_tags(qu, pdcounter=pdcounter)
993
994     def search_tags(self, query, filters=None, pdcounter=False):
995         """
996         Search for Tag objects using query.
997         """
998         if not filters: filters = []
999         if not pdcounter:
1000             filters.append(~self.index.Q(is_pdcounter=True))
1001         res = self.apply_filters(query, filters).execute()
1002
1003         tags = []
1004         for doc in res:
1005             is_pdcounter = doc.get('is_pdcounter', False)
1006             category = doc.get('tag_category')
1007             try:
1008                 if is_pdcounter == True:
1009                     if category == 'pd_author':
1010                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1011                     elif category == 'pd_book':
1012                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1013                         tag.category = 'pd_book'  # make it look more lik a tag.
1014                     else:
1015                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1016                 else:
1017                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1018                     # don't add the pdcounter tag if same tag already exists
1019
1020                 tags.append(tag)
1021
1022             except catalogue.models.Tag.DoesNotExist: pass
1023             except PDCounterAuthor.DoesNotExist: pass
1024             except PDCounterBook.DoesNotExist: pass
1025
1026         log.debug('search_tags: %s' % tags)
1027
1028         return tags
1029
1030     def hint_books(self, query, prefix=True):
1031         """
1032         Returns auto-complete hints for book titles
1033         Because we do not index 'pseudo' title-tags.
1034         Prefix search.
1035         """
1036         q = self.index.Q()
1037         query = query.strip()
1038         if prefix:
1039             q |= self.index.Q(title=query + "*")
1040         else:
1041             q |= self.make_term_query(query, field='title')
1042         qu = self.index.query(q)
1043         only_books = self.index.Q(is_book=True)
1044         return self.search_books(qu, [only_books])
1045
1046     def search_books(self, query, filters=None, max_results=10):
1047         """
1048         Searches for Book objects using query
1049         """
1050         bks = []
1051         res = self.apply_filters(query, filters).field_limit(['book_id'])
1052         for r in res:
1053             try:
1054                 bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
1055             except catalogue.models.Book.DoesNotExist: pass
1056         return bks
1057  
1058     # def make_prefix_phrase(self, toks, field):
1059     #     q = MultiPhraseQuery()
1060     #     for i in range(len(toks)):
1061     #         t = Term(field, toks[i])
1062     #         if i == len(toks) - 1:
1063     #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1064     #             if pterms:
1065     #                 q.add(pterms)
1066     #             else:
1067     #                 q.add(t)
1068     #         else:
1069     #             q.add(t)
1070     #     return q
1071
1072     # @staticmethod
1073     # def term_filter(term, inverse=False):
1074     #     only_term = TermsFilter()
1075     #     only_term.addTerm(term)
1076
1077     #     if inverse:
1078     #         neg = BooleanFilter()
1079     #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1080     #         only_term = neg
1081
1082     #     return only_term
1083
1084
1085
1086     @staticmethod
1087     def apply_filters(query, filters):
1088         """
1089         Apply filters to a query
1090         """
1091         if filters is None: filters = []
1092         filters = filter(lambda x: x is not None, filters)
1093         for f in filters:
1094             query = query.query(f)
1095         return query
1096
1097     # def filtered_categories(self, tags):
1098     #     """
1099     #     Return a list of tag categories, present in tags list.
1100     #     """
1101     #     cats = {}
1102     #     for t in tags:
1103     #         cats[t.category] = True
1104     #     return cats.keys()
1105
1106     # def hint(self):
1107     #     return Hint(self)