X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/5dbe16a08d4f68377aab0013b17d7094267cf98a..dbb6eb3883a5f5e371f4bf7c89e74326feca0fd1:/apps/search/index.py diff --git a/apps/search/index.py b/apps/search/index.py index 47583ec32..9d6d59861 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- from django.conf import settings -from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \ +from django.dispatch import Signal +from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \ File, Field, Integer, \ NumericField, Version, Document, JavaError, IndexSearcher, \ QueryParser, PerFieldAnalyzerWrapper, \ @@ -27,7 +28,7 @@ from librarian import dcparser from librarian.parser import WLDocument from lxml import etree import catalogue.models -from pdcounter.models import Author as PDCounterAuthor +from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook from multiprocessing.pool import ThreadPool from threading import current_thread import atexit @@ -82,7 +83,7 @@ class IndexStore(object): """ def __init__(self): self.make_index_dir() - self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX)) + self.store = NIOFSDirectory(File(settings.SEARCH_INDEX)) def make_index_dir(self): try: @@ -92,6 +93,9 @@ class IndexStore(object): pass else: raise + def close(self): + self.store.close() + class IndexChecker(IndexStore): def __init__(self): @@ -111,7 +115,7 @@ class Snippets(object): """ SNIPPET_DIR = "snippets" - def __init__(self, book_id): + def __init__(self, book_id, revision=None): try: os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)) except OSError as exc: @@ -119,15 +123,33 @@ class Snippets(object): pass else: raise self.book_id = book_id + self.revision = revision self.file = None + @property + def path(self): + if self.revision: fn = "%d.%d" % (self.book_id, self.revision) + else: fn = "%d" % self.book_id + + return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn) + def open(self, mode='r'): """ Open the snippet file. Call .close() afterwards. """ if not 'b' in mode: mode += 'b' - self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode) + + if 'w' in mode: + if os.path.exists(self.path): + self.revision = 1 + while True: + if not os.path.exists(self.path): + break + self.revision += 1 + print "using %s" % self.path + + self.file = open(self.path, mode) self.position = 0 return self @@ -156,6 +178,17 @@ class Snippets(object): """Close snippet file""" self.file.close() + def remove(self): + self.revision = None + try: + os.unlink(self.path) + self.revision = 0 + while True: + self.revision += 1 + os.unlink(self.path) + except OSError: + pass + class BaseIndex(IndexStore): """ @@ -169,11 +202,13 @@ class BaseIndex(IndexStore): analyzer = WLAnalyzer() self.analyzer = analyzer - def open(self, analyzer=None): + def open(self, timeout=None): if self.index: raise Exception("Index is already opened") - self.index = IndexWriter(self.store, self.analyzer,\ - IndexWriter.MaxFieldLength.LIMITED) + conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer) + if timeout: + conf.setWriteLockTimeout(long(timeout)) + self.index = IndexWriter(self.store, conf) return self.index def optimize(self): @@ -188,6 +223,10 @@ class BaseIndex(IndexStore): self.index.close() self.index = None + index_changed.send_robust(self) + + super(BaseIndex, self).close() + def __enter__(self): self.open() return self @@ -196,6 +235,9 @@ class BaseIndex(IndexStore): self.close() +index_changed = Signal() + + class Index(BaseIndex): """ Class indexing books. @@ -203,31 +245,66 @@ class Index(BaseIndex): def __init__(self, analyzer=None): super(Index, self).__init__(analyzer) - def index_tags(self): + def index_tags(self, *tags, **kw): """ Re-index global tag list. Removes all tags from index, then index them again. Indexed fields include: id, name (with and without polish stems), category """ - q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) - self.index.deleteDocuments(q) + remove_only = kw.get('remove_only', False) + # first, remove tags from index. + if tags: + q = BooleanQuery() + for tag in tags: + b_id_cat = BooleanQuery() + + q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True) + b_id_cat.add(q_id, BooleanClause.Occur.MUST) - for tag in catalogue.models.Tag.objects.all(): - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) - doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) - - for pdtag in PDCounterAuthor.objects.all(): - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id))) - doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED)) - doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) + if isinstance(tag, PDCounterAuthor): + q_cat = TermQuery(Term('tag_category', 'pd_author')) + elif isinstance(tag, PDCounterBook): + q_cat = TermQuery(Term('tag_category', 'pd_book')) + else: + q_cat = TermQuery(Term('tag_category', tag.category)) + b_id_cat.add(q_cat, BooleanClause.Occur.MUST) + + q.add(b_id_cat, BooleanClause.Occur.SHOULD) + else: # all + q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) + self.index.deleteDocuments(q) + + if not remove_only: + # then add them [all or just one passed] + if not tags: + tags = catalogue.models.Tag.objects.exclude(category='set') + \ + PDCounterAuthor.objects.all() + \ + PDCounterBook.objects.all() + + for tag in tags: + if isinstance(tag, PDCounterAuthor): + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) + doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) + elif isinstance(tag, PDCounterBook): + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) + doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) + else: + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) + doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) def create_book_doc(self, book): """ @@ -239,12 +316,16 @@ class Index(BaseIndex): doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id))) return doc - def remove_book(self, book): + def remove_book(self, book, remove_snippets=True): """Removes a book from search index. book - Book instance.""" q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True) self.index.deleteDocuments(q) + if remove_snippets: + snippets = Snippets(book.id) + snippets.remove() + def index_book(self, book, book_info=None, overwrite=True): """ Indexes the book. @@ -252,7 +333,9 @@ class Index(BaseIndex): and calls self.index_content() to index the contents of the book. """ if overwrite: - self.remove_book(book) + # we don't remove snippets, since they might be still needed by + # threads using not reopened index + self.remove_book(book, remove_snippets=False) book_doc = self.create_book_doc(book) meta_fields = self.extract_metadata(book, book_info) @@ -262,7 +345,6 @@ class Index(BaseIndex): book_doc.add(elem) else: book_doc.add(f) - self.index.addDocument(book_doc) del book_doc @@ -331,11 +413,13 @@ class Index(BaseIndex): (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED) # get published date - source = book_info.source_name - if hasattr(book_info, 'source_name'): - match = self.published_date_re.search(source) + pd = None + if hasattr(book_info, 'source_name') and book_info.source_name: + match = self.published_date_re.search(book_info.source_name) if match is not None: - fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED) + pd = str(match.groups()[0]) + if not pd: pd = "" + fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED) return fields @@ -370,11 +454,18 @@ class Index(BaseIndex): return [] def walker(node, ignore_tags=[]): - yield node, None - for child in filter(lambda n: n.tag not in ignore_tags, list(node)): - for b, e in walker(child): - yield b, e - yield None, node + + if node.tag not in ignore_tags: + yield node, None, None + if node.text is not None: + yield None, node.text, None + for child in list(node): + for b, t, e in walker(child): + yield b, t, e + yield None, None, node + + if node.tail is not None: + yield None, node.tail, None return def fix_format(text): @@ -401,12 +492,16 @@ class Index(BaseIndex): .setIntValue('header_span' in fields and fields['header_span'] or 1)) doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED)) + print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content']) + doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \ Field.TermVector.WITH_POSITIONS_OFFSETS)) snip_pos = snippets.add(fields["content"]) doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0])) doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1])) + if snippets.revision: + doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision)) if 'fragment_anchor' in fields: doc.add(Field("fragment_anchor", fields['fragment_anchor'], @@ -446,35 +541,50 @@ class Index(BaseIndex): # section content content = [] - footnote = None - - for start, end in walker(header, ignore_tags=self.ignore_content_tags): - # handle footnotes - # if start is not None and start.tag in self.footnote_tags: - # footnote = ' '.join(start.itertext()) - # elif end is not None and footnote is not None and end.tag in self.footnote_tags: - # doc = add_part(snippets, header_index=position, header_type=header.tag, - # content=footnote) + footnote = [] - # self.index.addDocument(doc) + def all_content(text): + for frag in fragments.values(): + frag['content'].append(text) + content.append(text) + handle_text = [all_content] - # footnote = None + for start, text, end in walker(header, ignore_tags=self.ignore_content_tags): + # handle footnotes + if start is not None and start.tag in self.footnote_tags: + footnote = [] + def collect_footnote(t): + footnote.append(t) + handle_text.append(collect_footnote) + elif end is not None and footnote is not [] and end.tag in self.footnote_tags: + handle_text.pop() + doc = add_part(snippets, header_index=position, header_type=header.tag, + content=u''.join(footnote), + is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)) + + self.index.addDocument(doc) + #print "@ footnote text: %s" % footnote + footnote = [] + # handle fragments and themes. if start is not None and start.tag == 'begin': fid = start.attrib['id'][1:] fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag} + # themes for this fragment elif start is not None and start.tag == 'motyw': fid = start.attrib['id'][1:] + handle_text.append(None) if start.text is not None: fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(','))) + elif end is not None and end.tag == 'motyw': + handle_text.pop() elif start is not None and start.tag == 'end': fid = start.attrib['id'][1:] if fid not in fragments: continue # a broken node, skip it - # import pdb; pdb.set_trace() frag = fragments[fid] if frag['themes'] == []: continue # empty themes list. @@ -487,22 +597,20 @@ class Index(BaseIndex): fragment_anchor=fid, content=fix_format(frag['content']), themes=frag['themes']) - + #print '@ FRAG %s' % frag['content'] self.index.addDocument(doc) # Collect content. - elif start is not None: - for frag in fragments.values(): - frag['content'].append(start.text) - content.append(start.text) - elif end is not None: - for frag in fragments.values(): - frag['content'].append(end.tail) - content.append(end.tail) + + if text is not None and handle_text is not []: + hdl = handle_text[-1] + if hdl is not None: + hdl(text) # in the end, add a section text. doc = add_part(snippets, header_index=position, header_type=header.tag, content=fix_format(content)) + #print '@ CONTENT: %s' % fix_format(content) self.index.addDocument(doc) @@ -531,12 +639,12 @@ class ReusableIndex(Index): """ index = None - def open(self, analyzer=None, threads=4): - if ReusableIndex.index is not None: + def open(self, analyzer=None, **kw): + if ReusableIndex.index: self.index = ReusableIndex.index else: print("opening index") - Index.open(self, analyzer) + Index.open(self, analyzer, **kw) ReusableIndex.index = self.index atexit.register(ReusableIndex.close_reusable) @@ -546,13 +654,17 @@ class ReusableIndex(Index): @staticmethod def close_reusable(): - if ReusableIndex.index is not None: + if ReusableIndex.index: + print("closing index") ReusableIndex.index.optimize() ReusableIndex.index.close() ReusableIndex.index = None + index_changed.send_robust(None) + def close(self): - pass + if ReusableIndex.index: + ReusableIndex.index.commit() class JoinSearch(object): @@ -616,9 +728,10 @@ class SearchResult(object): self.book_id = int(stored.get("book_id")) pd = stored.get("published_date") - if pd is None: - pd = 0 - self.published_date = int(pd) + try: + self.published_date = int(pd) + except ValueError: + self.published_date = 0 header_type = stored.get("header_type") # we have a content hit in some header of fragment @@ -652,6 +765,8 @@ class SearchResult(object): return self def get_book(self): + if hasattr(self, '_book'): + return self._book return catalogue.models.Book.objects.get(id=self.book_id) book = property(get_book) @@ -670,7 +785,10 @@ class SearchResult(object): # to sections and fragments frags = filter(lambda r: r[FRAGMENT] is not None, self._hits) + sect = filter(lambda r: r[FRAGMENT] is None, self._hits) + + # sections not covered by fragments sect = filter(lambda s: 0 == len(filter( lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX] and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN], @@ -678,15 +796,20 @@ class SearchResult(object): hits = [] - # remove duplicate fragments - fragments = {} - for f in frags: - fid = f[FRAGMENT] - if fid in fragments: - if fragments[fid][SCORE] >= f[SCORE]: - continue - fragments[fid] = f - frags = fragments.values() + def remove_duplicates(lst, keyfn, compare): + els = {} + for e in lst: + eif = keyfn(e) + if eif in els: + if compare(els[eif], e) >= 1: + continue + els[eif] = e + return els.values() + + # remove fragments with duplicated fid's and duplicated snippets + frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE])) + frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f), + lambda a, b: cmp(a[SCORE], b[SCORE])) # remove duplicate sections sections = {} @@ -708,7 +831,7 @@ class SearchResult(object): for f in frags: try: - frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT]) + frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id) except catalogue.models.Fragment.DoesNotExist: # stale index continue @@ -868,12 +991,32 @@ class Search(IndexStore): IndexStore.__init__(self) self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34) # self.analyzer = WLAnalyzer() - self.searcher = IndexSearcher(self.store, True) + reader = IndexReader.open(self.store, True) + self.searcher = IndexSearcher(reader) self.parser = QueryParser(Version.LUCENE_34, default_field, self.analyzer) self.parent_filter = TermsFilter() self.parent_filter.addTerm(Term("is_book", "true")) + index_changed.connect(self.reopen) + + def close(self): + reader = self.searcher.getIndexReader() + self.searcher.close() + reader.close() + super(Search, self).close() + index_changed.disconnect(self.reopen) + + def reopen(self, **unused): + reader = self.searcher.getIndexReader() + rdr = reader.reopen() + print "got signal to reopen index" + if not rdr.equals(reader): + print "will reopen index" + oldsearch = self.searcher + self.searcher = IndexSearcher(rdr) + oldsearch.close() + reader.close() def query(self, query): """Parse query in default Lucene Syntax. (for humans) @@ -1187,18 +1330,28 @@ class Search(IndexStore): length = stored.get('snippets_length') if position is None or length is None: return None + revision = stored.get('snippets_revision') + if revision: revision = int(revision) # locate content. - snippets = Snippets(stored.get('book_id')).open() + book_id = int(stored.get('book_id')) + snippets = Snippets(book_id, revision=revision).open() try: - text = snippets.get((int(position), - int(length))) - finally: - snippets.close() + try: + text = snippets.get((int(position), + int(length))) + finally: + snippets.close() - tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) - # highlighter.getBestTextFragments(tokenStream, text, False, 10) - snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer) + # highlighter.getBestTextFragments(tokenStream, text, False, 10) + snip = highlighter.getBestFragments(tokenStream, text, 3, "...") + except Exception, e: + e2 = e + if hasattr(e, 'getJavaException'): + e2 = unicode(e.getJavaException()) + raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)), + e2) return snip @staticmethod @@ -1230,13 +1383,25 @@ class Search(IndexStore): for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) is_pdcounter = doc.get('is_pdcounter') - if is_pdcounter: - tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) - else: - tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) - # don't add the pdcounter tag if same tag already exists - if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)): - tags.append(tag) + category = doc.get('tag_category') + try: + if is_pdcounter == 'true': + if category == 'pd_author': + tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) + elif category == 'pd_book': + tag = PDCounterBook.objects.get(id=doc.get('tag_id')) + tag.category = 'pd_book' # make it look more lik a tag. + else: + print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category) + else: + tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) + # don't add the pdcounter tag if same tag already exists + if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)): + tags.append(tag) + except catalogue.models.Tag.DoesNotExist: pass + except PDCounterAuthor.DoesNotExist: pass + except PDCounterBook.DoesNotExist: pass + # print "%s (%d) -> %f" % (tag, tag.id, found.score) print 'returning %s' % tags return tags @@ -1249,7 +1414,9 @@ class Search(IndexStore): tops = self.searcher.search(query, filter, max_results) for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) - bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + try: + bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + except catalogue.models.Book.DoesNotExist: pass return bks def make_prefix_phrase(self, toks, field): @@ -1278,7 +1445,7 @@ class Search(IndexStore): return only_term - def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True): + def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False): """ Return auto-complete hints for tags using prefix search. @@ -1290,14 +1457,14 @@ class Search(IndexStore): if prefix: q = self.make_prefix_phrase(toks, field) else: - q = self.make_term_query(toks, field) + q = self.make_term_query(toks, field, fuzzy=fuzzy) top.add(BooleanClause(q, BooleanClause.Occur.SHOULD)) no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True) return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter) - def hint_books(self, string, max_results=50, prefix=True): + def hint_books(self, string, max_results=50, prefix=True, fuzzy=False): """ Returns auto-complete hints for book titles Because we do not index 'pseudo' title-tags. @@ -1308,7 +1475,7 @@ class Search(IndexStore): if prefix: q = self.make_prefix_phrase(toks, 'title') else: - q = self.make_term_query(toks, 'title') + q = self.make_term_query(toks, 'title', fuzzy=fuzzy) return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)