X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/38f324dec64ba8adffcc795095023557b8a7a39c..e259433948fefe70e35092769540a39a389e8019:/apps/search/index.py?ds=sidebyside diff --git a/apps/search/index.py b/apps/search/index.py index 8ea31240e..a0bf71588 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- from django.conf import settings -from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \ +from django.dispatch import Signal +from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \ File, Field, Integer, \ NumericField, Version, Document, JavaError, IndexSearcher, \ QueryParser, PerFieldAnalyzerWrapper, \ @@ -17,7 +18,7 @@ from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \ # KeywordAnalyzer # Initialize jvm -JVM = initVM(CLASSPATH) +JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP) import sys import os @@ -27,12 +28,14 @@ from librarian import dcparser from librarian.parser import WLDocument from lxml import etree import catalogue.models -from pdcounter.models import Author as PDCounterAuthor +from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook from multiprocessing.pool import ThreadPool from threading import current_thread +from itertools import chain import atexit import traceback - +import logging +log = logging.getLogger('search') class WLAnalyzer(PerFieldAnalyzerWrapper): def __init__(self): @@ -82,7 +85,7 @@ class IndexStore(object): """ def __init__(self): self.make_index_dir() - self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX)) + self.store = NIOFSDirectory(File(settings.SEARCH_INDEX)) def make_index_dir(self): try: @@ -92,6 +95,9 @@ class IndexStore(object): pass else: raise + def close(self): + self.store.close() + class IndexChecker(IndexStore): def __init__(self): @@ -111,7 +117,7 @@ class Snippets(object): """ SNIPPET_DIR = "snippets" - def __init__(self, book_id): + def __init__(self, book_id, revision=None): try: os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)) except OSError as exc: @@ -119,15 +125,32 @@ class Snippets(object): pass else: raise self.book_id = book_id + self.revision = revision self.file = None + @property + def path(self): + if self.revision: fn = "%d.%d" % (self.book_id, self.revision) + else: fn = "%d" % self.book_id + + return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn) + def open(self, mode='r'): """ Open the snippet file. Call .close() afterwards. """ if not 'b' in mode: mode += 'b' - self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode) + + if 'w' in mode: + if os.path.exists(self.path): + self.revision = 1 + while True: + if not os.path.exists(self.path): + break + self.revision += 1 + + self.file = open(self.path, mode) self.position = 0 return self @@ -156,6 +179,17 @@ class Snippets(object): """Close snippet file""" self.file.close() + def remove(self): + self.revision = None + try: + os.unlink(self.path) + self.revision = 0 + while True: + self.revision += 1 + os.unlink(self.path) + except OSError: + pass + class BaseIndex(IndexStore): """ @@ -169,11 +203,13 @@ class BaseIndex(IndexStore): analyzer = WLAnalyzer() self.analyzer = analyzer - def open(self, analyzer=None): + def open(self, timeout=None): if self.index: raise Exception("Index is already opened") - self.index = IndexWriter(self.store, self.analyzer,\ - IndexWriter.MaxFieldLength.LIMITED) + conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer) + if timeout: + conf.setWriteLockTimeout(long(timeout)) + self.index = IndexWriter(self.store, conf) return self.index def optimize(self): @@ -183,11 +219,15 @@ class BaseIndex(IndexStore): try: self.index.optimize() except JavaError, je: - print "Error during optimize phase, check index: %s" % je + log.error("Error during optimize phase, check index: %s" % je) self.index.close() self.index = None + index_changed.send_robust(self) + + super(BaseIndex, self).close() + def __enter__(self): self.open() return self @@ -196,6 +236,9 @@ class BaseIndex(IndexStore): self.close() +index_changed = Signal() + + class Index(BaseIndex): """ Class indexing books. @@ -203,31 +246,66 @@ class Index(BaseIndex): def __init__(self, analyzer=None): super(Index, self).__init__(analyzer) - def index_tags(self): + def index_tags(self, *tags, **kw): """ Re-index global tag list. Removes all tags from index, then index them again. Indexed fields include: id, name (with and without polish stems), category """ - q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) - self.index.deleteDocuments(q) + remove_only = kw.get('remove_only', False) + # first, remove tags from index. + if tags: + q = BooleanQuery() + for tag in tags: + b_id_cat = BooleanQuery() - for tag in catalogue.models.Tag.objects.all(): - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) - doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) - - for pdtag in PDCounterAuthor.objects.all(): - doc = Document() - doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id))) - doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED)) - doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED)) - doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) - self.index.addDocument(doc) + q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True) + b_id_cat.add(q_id, BooleanClause.Occur.MUST) + + if isinstance(tag, PDCounterAuthor): + q_cat = TermQuery(Term('tag_category', 'pd_author')) + elif isinstance(tag, PDCounterBook): + q_cat = TermQuery(Term('tag_category', 'pd_book')) + else: + q_cat = TermQuery(Term('tag_category', tag.category)) + b_id_cat.add(q_cat, BooleanClause.Occur.MUST) + + q.add(b_id_cat, BooleanClause.Occur.SHOULD) + else: # all + q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True) + self.index.deleteDocuments(q) + + if not remove_only: + # then add them [all or just one passed] + if not tags: + tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \ + PDCounterAuthor.objects.all(), \ + PDCounterBook.objects.all()) + + for tag in tags: + if isinstance(tag, PDCounterAuthor): + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) + doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) + elif isinstance(tag, PDCounterBook): + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) + doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED)) + doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) + else: + doc = Document() + doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id))) + doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED)) + doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED)) + self.index.addDocument(doc) def create_book_doc(self, book): """ @@ -239,12 +317,21 @@ class Index(BaseIndex): doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id))) return doc - def remove_book(self, book): + def remove_book(self, book_or_id, remove_snippets=True): """Removes a book from search index. book - Book instance.""" - q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True) + if isinstance(book_or_id, catalogue.models.Book): + book_id = book_or_id.id + else: + book_id = book_or_id + + q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True) self.index.deleteDocuments(q) + if remove_snippets: + snippets = Snippets(book_id) + snippets.remove() + def index_book(self, book, book_info=None, overwrite=True): """ Indexes the book. @@ -252,17 +339,22 @@ class Index(BaseIndex): and calls self.index_content() to index the contents of the book. """ if overwrite: - self.remove_book(book) + # we don't remove snippets, since they might be still needed by + # threads using not reopened index + self.remove_book(book, remove_snippets=False) book_doc = self.create_book_doc(book) - meta_fields = self.extract_metadata(book, book_info) + meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title']) + # let's not index it - it's only used for extracting publish date + if 'source_name' in meta_fields: + del meta_fields['source_name'] + for f in meta_fields.values(): if isinstance(f, list) or isinstance(f, tuple): for elem in f: book_doc.add(elem) else: book_doc.add(f) - self.index.addDocument(book_doc) del book_doc @@ -290,7 +382,7 @@ class Index(BaseIndex): published_date_re = re.compile("([0-9]+)[\]. ]*$") - def extract_metadata(self, book, book_info=None): + def extract_metadata(self, book, book_info=None, dc_only=None): """ Extract metadata from book and returns a map of fields keyed by fieldname """ @@ -305,6 +397,8 @@ class Index(BaseIndex): # validator, name for field in dcparser.BookInfo.FIELDS: + if dc_only and field.name not in dc_only: + continue if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue @@ -416,6 +510,8 @@ class Index(BaseIndex): snip_pos = snippets.add(fields["content"]) doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0])) doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1])) + if snippets.revision: + doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision)) if 'fragment_anchor' in fields: doc.add(Field("fragment_anchor", fields['fragment_anchor'], @@ -537,7 +633,7 @@ def log_exception_wrapper(f): try: f(*a) except Exception, e: - print("Error in indexing thread: %s" % e) + log.error("Error in indexing thread: %s" % e) traceback.print_exc() raise e return _wrap @@ -553,12 +649,11 @@ class ReusableIndex(Index): """ index = None - def open(self, analyzer=None, threads=4): + def open(self, analyzer=None, **kw): if ReusableIndex.index: self.index = ReusableIndex.index else: - print("opening index") - Index.open(self, analyzer) + Index.open(self, analyzer, **kw) ReusableIndex.index = self.index atexit.register(ReusableIndex.close_reusable) @@ -569,11 +664,12 @@ class ReusableIndex(Index): @staticmethod def close_reusable(): if ReusableIndex.index: - print("closing index") ReusableIndex.index.optimize() ReusableIndex.index.close() ReusableIndex.index = None + index_changed.send_robust(None) + def close(self): if ReusableIndex.index: ReusableIndex.index.commit() @@ -640,9 +736,10 @@ class SearchResult(object): self.book_id = int(stored.get("book_id")) pd = stored.get("published_date") - if pd is None: - pd = 0 - self.published_date = int(pd) + try: + self.published_date = int(pd) + except ValueError: + self.published_date = 0 header_type = stored.get("header_type") # we have a content hit in some header of fragment @@ -676,6 +773,8 @@ class SearchResult(object): return self def get_book(self): + if hasattr(self, '_book'): + return self._book return catalogue.models.Book.objects.get(id=self.book_id) book = property(get_book) @@ -694,7 +793,10 @@ class SearchResult(object): # to sections and fragments frags = filter(lambda r: r[FRAGMENT] is not None, self._hits) + sect = filter(lambda r: r[FRAGMENT] is None, self._hits) + + # sections not covered by fragments sect = filter(lambda s: 0 == len(filter( lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX] and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN], @@ -702,15 +804,20 @@ class SearchResult(object): hits = [] - # remove duplicate fragments - fragments = {} - for f in frags: - fid = f[FRAGMENT] - if fid in fragments: - if fragments[fid][SCORE] >= f[SCORE]: - continue - fragments[fid] = f - frags = fragments.values() + def remove_duplicates(lst, keyfn, compare): + els = {} + for e in lst: + eif = keyfn(e) + if eif in els: + if compare(els[eif], e) >= 1: + continue + els[eif] = e + return els.values() + + # remove fragments with duplicated fid's and duplicated snippets + frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE])) + frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT], + lambda a, b: cmp(a[SCORE], b[SCORE])) # remove duplicate sections sections = {} @@ -732,7 +839,7 @@ class SearchResult(object): for f in frags: try: - frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT]) + frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id) except catalogue.models.Fragment.DoesNotExist: # stale index continue @@ -775,7 +882,6 @@ class SearchResult(object): for r in rl: if r.book_id in books: books[r.book_id].merge(r) - #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score)) else: books[r.book_id] = r return books.values() @@ -892,12 +998,31 @@ class Search(IndexStore): IndexStore.__init__(self) self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34) # self.analyzer = WLAnalyzer() - self.searcher = IndexSearcher(self.store, True) + reader = IndexReader.open(self.store, True) + self.searcher = IndexSearcher(reader) self.parser = QueryParser(Version.LUCENE_34, default_field, self.analyzer) self.parent_filter = TermsFilter() self.parent_filter.addTerm(Term("is_book", "true")) + index_changed.connect(self.reopen) + + def close(self): + reader = self.searcher.getIndexReader() + self.searcher.close() + reader.close() + super(Search, self).close() + index_changed.disconnect(self.reopen) + + def reopen(self, **unused): + reader = self.searcher.getIndexReader() + rdr = reader.reopen() + if not rdr.equals(reader): + log.debug('Reopening index') + oldsearch = self.searcher + self.searcher = IndexSearcher(rdr) + oldsearch.close() + reader.close() def query(self, query): """Parse query in default Lucene Syntax. (for humans) @@ -941,7 +1066,8 @@ class Search(IndexStore): return toks - def fuzziness(self, fuzzy): + @staticmethod + def fuzziness(fuzzy): """Helper method to sanitize fuzziness""" if not fuzzy: return None @@ -962,7 +1088,6 @@ class Search(IndexStore): fuzzterms = [] while True: - # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8')) ft = fuzzterm.term() if ft: fuzzterms.append(ft) @@ -979,7 +1104,8 @@ class Search(IndexStore): phrase.add(term) return phrase - def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): + @staticmethod + def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False): """ Returns term queries joined by boolean query. modal - applies to boolean query @@ -1133,7 +1259,6 @@ class Search(IndexStore): topDocs = self.searcher.search(q, only_in, max_results) for found in topDocs.scoreDocs: books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched)) - print "* %s theme x content: %s" % (searched, books[-1]._hits) # query themes/content x author/title/tags q = BooleanQuery() @@ -1152,7 +1277,6 @@ class Search(IndexStore): topDocs = self.searcher.search(q, only_in, max_results) for found in topDocs.scoreDocs: books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched)) - print "* %s scatter search: %s" % (searched, books[-1]._hits) return books @@ -1211,9 +1335,19 @@ class Search(IndexStore): length = stored.get('snippets_length') if position is None or length is None: return None + revision = stored.get('snippets_revision') + if revision: revision = int(revision) + # locate content. book_id = int(stored.get('book_id')) - snippets = Snippets(book_id).open() + snippets = Snippets(book_id, revision=revision) + + try: + snippets.open() + except IOError, e: + log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e)) + return [] + try: try: text = snippets.get((int(position), @@ -1250,38 +1384,53 @@ class Search(IndexStore): if terms: return JArray('object')(terms, Term) - def search_tags(self, query, filters=None, max_results=40, pdcounter=False): + def search_tags(self, query, filt=None, max_results=40, pdcounter=False): """ Search for Tag objects using query. """ if not pdcounter: - filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)]) - tops = self.searcher.search(query, filters, max_results) + filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)]) + tops = self.searcher.search(query, filt, max_results) tags = [] for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) is_pdcounter = doc.get('is_pdcounter') - if is_pdcounter: - tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) - else: - tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) - # don't add the pdcounter tag if same tag already exists - if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)): + category = doc.get('tag_category') + try: + if is_pdcounter == 'true': + if category == 'pd_author': + tag = PDCounterAuthor.objects.get(id=doc.get('tag_id')) + elif category == 'pd_book': + tag = PDCounterBook.objects.get(id=doc.get('tag_id')) + tag.category = 'pd_book' # make it look more lik a tag. + else: + print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category) + else: + tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id")) + # don't add the pdcounter tag if same tag already exists + tags.append(tag) - # print "%s (%d) -> %f" % (tag, tag.id, found.score) - print 'returning %s' % tags + + except catalogue.models.Tag.DoesNotExist: pass + except PDCounterAuthor.DoesNotExist: pass + except PDCounterBook.DoesNotExist: pass + + log.debug('search_tags: %s' % tags) + return tags - def search_books(self, query, filter=None, max_results=10): + def search_books(self, query, filt=None, max_results=10): """ Searches for Book objects using query """ bks = [] - tops = self.searcher.search(query, filter, max_results) + tops = self.searcher.search(query, filt, max_results) for found in tops.scoreDocs: doc = self.searcher.doc(found.doc) - bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + try: + bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + except catalogue.models.Book.DoesNotExist: pass return bks def make_prefix_phrase(self, toks, field):