# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, Term, TermQuery, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause, CharTermAttribute, \
    PhraseQuery, StringReader
import os
import errno
import atexit

from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
 

class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED, meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        # self.addAnalyzer("fragment_anchor", keyword)
 

class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
 

class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)

    def close(self):
        self.index.optimize()
        self.index.close()
        self.index = None
 
    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

        for p in parts:
            block.add(p)
        block.add(doc)  # the parent (book) document must be last in the block
        self.index.addDocuments(block)
 
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu']
 
    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc
 
    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc
 
    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
 
    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        # First we build a sequence of top-level items:
        # header_index - the 0-indexed position of the header element.
        master = self.get_master(root)

        header_docs = []
        for header, position in zip(list(master), range(len(master))):
            if header.tag in self.skip_header_tags:
                continue
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
            content = u' '.join([t for t in header.itertext()])
            doc.add(Field("content", content, Field.Store.NO, Field.Index.ANALYZED))
            header_docs.append(doc)
 
        def walker(node):
            # Depth-first walk yielding (node, None) on entering an element
            # and (None, node) on leaving it.
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        # Then we create a document for each fragment:
        # fragment_anchor - the anchor
        # themes - list of themes [not indexed]
        # (an illustrative sketch of the markup follows this method)

        # will contain: fragment id -> {'content': [], 'themes': []}
        fragments = {}
        fragment_docs = []
        for start, end in walker(master):
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                if fid not in fragments:
                    continue  # a broken <end> node, skip it
                frag = fragments[fid]

                # helper: join list items into a string, rendering None as u'(none)'
                def jstr(l):
                    return u' '.join(map(
                        lambda x: x is None and u'(none)' or unicode(x),
                        l))

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content",
                              u' '.join(filter(lambda s: s is not None, frag['content'])),
                              Field.Store.NO, Field.Index.ANALYZED))
                doc.add(Field("themes",
                              u' '.join(filter(lambda s: s is not None, frag['themes'])),
                              Field.Store.NO, Field.Index.ANALYZED))

                fragment_docs.append(doc)
            elif start is not None:
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs
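
    # An illustrative sketch of the markup consumed above (hypothetical ids, not
    # taken from a real WL document):
    #
    #     <begin id="b1"/> ... <motyw id="m1">Love</motyw> ... <end id="e1"/>
    #
    # Text encountered while a fragment is open is collected into its 'content'
    # list, <motyw> text into its 'themes' list, and a single fragment document
    # is emitted once the matching <end> marker is reached.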
 
    def __exit__(self, type, value, tb):
        self.close()
 

class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    A usage sketch follows the class definition.
    """
    index = None

    def open(self, analyzer=None):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        # Keep the shared writer open; it is closed by close_reusable() at exit.
        pass
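
# A minimal usage sketch (assuming a configured Django environment with
# settings.SEARCH_INDEX set; `some_books` is a hypothetical iterable of
# catalogue.models.Book objects, not part of this module):
#
#     index = ReusableIndex()
#     index.open()
#     for book in some_books:
#         index.index_book(book)
#     # The shared IndexWriter stays open; it is optimized and closed by the
#     # atexit hook, or explicitly via ReusableIndex.close_reusable().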
 

class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = PolishAnalyzer(Version.LUCENE_34)
        ## self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        return self.parser.parse(query)
 
    def wrapjoins(self, query, fields=[]):
        """
        Recursively modifies the query so that contained Term and Phrase
        queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        (See the example note below this method.)
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)
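
    # Example (a sketch of the intended effect, not output captured from the
    # parser): with fields=["content", "themes"], a parsed query like
    #     +title:pan +content:tadeusz
    # keeps the title clause unchanged, while the content clause becomes
    #     BlockJoinQuery(content:tadeusz, parent_filter, ScoreMode.Total)
    # so it is matched against fragment/header (child) documents and joined
    # up to their parent book documents.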
 
    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)"""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
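
    # Note on the two join strategies above: search() wraps only the
    # content/themes clauses in a BlockJoinQuery (via wrapjoins), while
    # bsearch() wraps the whole parsed query in a single BlockJoinQuery with
    # ScoreMode.Avg, so every clause is matched against child (fragment/header)
    # documents and their scores are averaged per parent book.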
 
# Reference (Java) snippet for the Lucene TokenStream API used in
# MultiSearch.get_tokens below:
#
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
#
# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }
 

class MultiSearch(Search):
    """Class capable of IMDb-like searching."""

    def get_tokens(self, queryreader):
        if isinstance(queryreader, str):
            queryreader = StringReader(queryreader)

        tokens = self.analyzer.reusableTokenStream('content', queryreader)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def make_phrase(self, tokens, field='content', joined=False):
        phrase = PhraseQuery()
        for t in tokens:
            term = Term(field, t)
            phrase.add(term)
        if joined:
            phrase = self.content_query(phrase)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, joined=False):
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            q.add(BooleanClause(TermQuery(term), modal))
        if joined:
            q = self.content_query(q)
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)
 
    def multiseach(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> themes
                      -> tags
                      -> content
        A usage sketch follows at the end of the module.
        """
        queryreader = StringReader(query)
        tokens = self.get_tokens(queryreader)

        top_level = BooleanQuery()
        Should = BooleanClause.Occur.SHOULD

        phrase_level = BooleanQuery()

        p_content = self.make_phrase(tokens, joined=True)
        p_title = self.make_phrase(tokens, 'title')
        p_author = self.make_phrase(tokens, 'author')

        phrase_level.add(BooleanClause(p_content, Should))
        phrase_level.add(BooleanClause(p_title, Should))
        phrase_level.add(BooleanClause(p_author, Should))

        kw_level = BooleanQuery()

        kw_level.add(self.make_term_query(tokens, 'author'), Should)
        kw_level.add(self.make_term_query(tokens, 'themes', joined=True), Should)
        kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        kw_level.add(self.make_term_query(tokens, joined=True), Should)

        top_level.add(BooleanClause(phrase_level, Should))
        top_level.add(BooleanClause(kw_level, Should))

        tops = self.searcher.search(top_level, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
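
# A minimal usage sketch (assuming the index under settings.SEARCH_INDEX has
# already been built, e.g. with Index/ReusableIndex above; the query string is
# only an illustration):
#
#     search = MultiSearch()
#     books, total = search.multiseach(u"pan tadeusz", max_results=20)
#     # `books` is a list of catalogue.models.Book instances,
#     # `total` is the Lucene hit count (totalHits).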