# -*- coding: utf-8 -*-

from django.conf import settings

from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before any lucene classes are used.
JVM = initVM(CLASSPATH)

# Standard library modules used below (os/errno/re for the index directory and
# date parsing, atexit/traceback for ReusableIndex bookkeeping).
import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread
 
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
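
    # Illustrative usage sketch (not part of the original module): per-field
    # analysis means the same text is tokenized differently depending on the
    # field name, e.g.:
    #
    #   analyzer = WLAnalyzer()
    #   ts = analyzer.reusableTokenStream("tags", StringReader(u"Epika, Liryka"))
    #   # "tags" goes through SimpleAnalyzer; "url" would use KeywordAnalyzer
    #   # (kept as a single token); unregistered fields fall back to PolishAnalyzer.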
 
class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
 
class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status
 
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book):
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return a unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
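
    # Illustrative usage sketch (assumes an existing book id and a writable
    # settings.SEARCH_INDEX directory; not part of the original module):
    #
    #   snips = Snippets(book_id).open('w')
    #   pos = snips.add(u"Fragment text to highlight later")  # -> (offset, length)
    #   snips.close()
    #   # later, when showing results:
    #   text = Snippets(book_id).open().get(pos)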
 
class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
 
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterBook.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
 
    def create_book_doc(self, book):
        """
        Create a lucene document referring to a book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
 
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
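
    # Illustrative sketch of a typical indexing flow (not part of the original
    # module; assumes the writer is opened first):
    #
    #   ind = Index()
    #   ind.open()
    #   try:
    #       ind.index_book(book)   # a catalogue.models.Book instance
    #   finally:
    #       ind.close()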
 
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
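
    # Illustrative example (not from the original source): published_date_re
    # picks the trailing run of digits, ignoring a final "]", "." or spaces, e.g.
    #   published_date_re.search(u"Gebethner i Wolff, 1890.").groups()[0] == "1890"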
 
    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by fieldname.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # published date: the last number in source_name, if present
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields
 
    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows phrase queries that do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
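
    # For fields [f1, f2, f3] the result is [f1, gap, f2, gap, f3]: zip() pairs
    # each field with a gap, reduce(+) flattens the pairs into one tuple, and
    # the trailing gap is dropped by the [0:-1] slice.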
 
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
 
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
 
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc
 
        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                    content.append(text)
                handle_text = [all_content]
 
                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=u''.join(footnote),
                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                        self.index.addDocument(doc)
                        #print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        #print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)

                    # plain text of the section
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                #print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)

        finally:
            snippets.close()
 
def log_exception_wrapper(f):
    def _wrap(*args, **kw):
        try:
            f(*args, **kw)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap
 
class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
 
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that Term and Phrase Queries it contains, which match
        the provided fields, are wrapped in a BlockJoinQuery
        and so delegated to children documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return query
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score
        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost
 
    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
 
    @property
    def hits(self):
        # (accessor name assumed; the original definition line is missing here)
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into the hit tuples built in __init__:
        # hit = ((header_type, header_index, header_span), fragment, score, other)
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections that are already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip an existing entry if it has a higher score
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index entry
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if not theme in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits
 
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()
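
    # Illustrative sketch (not from the original source): combining hits from
    # several search strategies into one per-book result list:
    #
    #   results = SearchResult.aggregate(search.search_perfect_book(q),
    #                                    search.search_everywhere(q))
    #   results.sort(reverse=True)  # __cmp__ puts best score (and earlier date) first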
 
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
 
class Hint(object):
    """
    Given some hint information (information we already know about
    our search target), like the author, title (specific book), epoch, genre or kind,
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        should be taken into account in the search.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field),
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Figures out which fields should be searched, given the hints we already have."""
        fields = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            fields.append(field)
        return fields
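
    # Illustrative usage sketch (not from the original source; the tag lookup
    # is an assumption):
    #
    #   hint = Hint(search)
    #   hint.tags(catalogue.models.Tag.objects.filter(category='author',
    #                                                 slug='adam-mickiewicz'))
    #   books = search.search_perfect_book(u"pan tadeusz", hint=hint)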
 
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
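
    # Illustrative usage sketch (not from the original source):
    #
    #   search = Search()
    #   books, total = search.simple_search(u'authors: mickiewicz')
    #   hits = search.search_everywhere(u'wolność ojczyzna')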
 
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be a StringReader, a string/unicode, or a list of tokens.
        In the last case the tokens are simply returned (so we can reuse tokens
        if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
 
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
 
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
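
    # Illustrative example (not from the original source): for tokens
    # [u"pan", u"tadeusz"] and the default modal this builds roughly
    # content:pan OR content:tadeusz; with modal=BooleanClause.Occur.MUST
    # every token has to match.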
 
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
 
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches with some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))

        return books
 
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books
 
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books
 
    # def multisearch(self, query, max_results=50):
    #     """
    #     Search strategy:
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors
    #     """
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
 
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate the stored content and highlight the best fragments.
        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        finally:
            snippets.close()

        return snip
 
    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        use in queries.
        """
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)
 
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            if is_pdcounter == 'true':
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
                #            print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags
 
    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
 
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                # expand the last token into all terms it is a prefix of
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
 
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
 
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
 
    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title-tags), using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
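
    # Illustrative example (not from the original source): as the user types,
    # a view could call
    #   search.hint_books(u"pan tad")
    # and get Book objects whose title starts with the typed prefix
    # (the last token is expanded by make_prefix_phrase).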
 
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, skipping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
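
    # Illustrative example (not from the original source): AND-ing an optional
    # hint filter with the is_book filter; None entries are simply dropped:
    #
    #   flt = Search.chain_filters([hint and hint.book_filter() or None,
    #                               Search.term_filter(Term('is_book', 'true'))])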
 
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()