# -*- coding: utf-8 -*-
import os
import re
import errno
import atexit
import traceback

from django.conf import settings

from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)
 
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
 
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        #        polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        #        simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
 
class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
 
class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status
 
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
 
class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)
        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
 
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
 
    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc
 
    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
 
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
 
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from book and returns a map of fields keyed by fieldname.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields
 
    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
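
    # Illustrative note (not in the original file): for fields [F1, F2],
    #   zip([F1, F2], gap()) -> [(F1, gap), (F2, gap)]
    # and reduce(+) flattens that to (F1, gap, F2, gap); [0:-1] drops the
    # trailing gap. With slop=0 a phrase query can then match within one
    # value, but not across the indexed-space boundary between two values.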
 
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
 
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
 
        def fix_format(text):
            return re.sub("(?m)/$", "", text)
 
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                     for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc
 
        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
 
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)
 
                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        del fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.

                        def jstr(l):
                            return u' '.join(map(
                                lambda x: x == None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()
 
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap
 
class ReusableIndex(Index):
    """
    Works like index, but does not close/optimize Lucene index
    until program exit (uses atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)
 
    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)
 
    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        # Closing is deferred until program exit (see close_reusable above).
        pass
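
# Usage sketch (illustrative, not part of the original file): several
# indexing runs in one process share the writer; the first open() registers
# the atexit hook, later calls reuse the same IndexWriter, and the index is
# optimized and closed once, at interpreter exit.
#
#   idx = ReusableIndex()
#   idx.open()
#   for book in books:
#       idx.index_book(book)
#   # ... at process exit ReusableIndex.close_reusable() runs via atexit.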
 
class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively, so that Term and Phrase queries
        which match the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)
 
    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self._hits = []
        self.hits = None  # processed hits

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        # a whole-book hit carries no header_type, so there is no hit position to record
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

        self._hits.append(hit)
 
    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self.score = other.score
 
    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
 
    def process_hits(self):
        # indices into the raw hit tuples built in __init__
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # leave only sections not covered by any fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip existing if higher score
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            m = {'score': f[SCORE],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self.hits = hits
        return hits
 
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
 
    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()
 
    def __cmp__(self, other):
        return cmp(self.score, other.score)
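
# Usage sketch (illustrative, not part of the original file): hits coming
# from several queries are grouped per book with aggregate(), which merges
# results sharing a book_id; process_hits() then flattens each result into
# display-ready dicts (sections and fragments with scores and themes).
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(u"Pan Tadeusz"),
#       search.search_everywhere(u"Pan Tadeusz"))
#   for r in sorted(results, reverse=True):   # __cmp__ sorts by score
#       r.process_hits()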
 
class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind
    we can narrow down search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are relevant to the search.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)
 
    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (normally the 'tags' field),
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)
 
    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None
 
    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)
 
    def should_search_for_book(self):
        return self._books == []
 
    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched, when we already have some hints."""
        searched = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            searched.append(field)
        return searched
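
# Usage sketch (illustrative, not part of the original file): a Hint narrows
# subsequent searches; tags the user has already picked become filters, and
# fields already covered by a hint are dropped from the search list.
#
#   hint = Hint(search)
#   hint.tags(list_of_tag_objects)            # e.g. one 'author' tag
#   hint.just_search_in(['authors', 'title']) # -> ['title']
#   flt = hint.book_filter()                  # QueryWrapperFilter or None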
 
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
 
    def query(self, query):
        """Parse query in default Lucene Syntax. (for humans)"""
        return self.parser.parse(query)
 
    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
 
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by a proper (for a field) analyzer.
        The argument can be a StringReader, string/unicode, or tokens. In the last case
        they will just be returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks
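
    # Illustrative sketch (not in the original file): the field name decides
    # which analyzer runs, so the same text tokenizes differently, e.g.
    #   self.get_tokens(u"Zielone jabłka", field='SIMPLE')
    #       # -> lowercased word tokens, no stemming
    #   self.get_tokens(u"Zielone jabłka", field='content')
    #       # -> PolishAnalyzer output: stemmed base forms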
 
    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
 
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    #                    print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
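
    # Illustrative sketch (not in the original file): with fuzzy=False this
    # is an ordinary sloppy PhraseQuery; with fuzzy=True every position in
    # the MultiPhraseQuery holds all terms within the requested similarity.
    #
    #   toks = self.get_tokens(u"pan tadeusz", field='content')
    #   q = self.make_phrase(toks, field='content', slop=2)
    #   tops = self.searcher.search(q, 10)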
 
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
 
    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)
 
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches with some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books
 
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()
        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self.searcher, found))
        return books
 
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books
 
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use search terms to match different fields of book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content')
        tokens = self.get_tokens(searched, field='SIMPLE')

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))

        return books
 
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
 
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate content
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip
 
    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to a query.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)
 
    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags
 
    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
 
    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                # expand the last token into all terms sharing its prefix
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
 
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
 
    def hint_tags(self, string, max_results=50):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)
 
    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles.
        Uses prefix search, because we do not index 'pseudo' title-tags.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
 
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, skipping Nones.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
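
    # Illustrative sketch (not in the original file): chain_filters() drops
    # Nones, so optional filters can be passed unconditionally.
    #
    #   flt = self.chain_filters([
    #       hint and hint.book_filter() or None,
    #       self.term_filter(Term('is_book', 'true'))])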
 
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()