X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/919a5d574aac85d55b6cfd17f0a81dd5ff56265a..cb91a32c4411dcc5bd3b433536fea0dea64ea493:/apps/search/index.py?ds=sidebyside diff --git a/apps/search/index.py b/apps/search/index.py index a3a62de18..8b0cfb79f 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -6,7 +6,7 @@ from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \ NumericField, Version, Document, JavaError, IndexSearcher, \ QueryParser, PerFieldAnalyzerWrapper, \ SimpleAnalyzer, PolishAnalyzer, ArrayList, \ - KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \ + KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \ BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \ HashSet, BooleanClause, Term, CharTermAttribute, \ PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \ @@ -55,6 +55,7 @@ class WLAnalyzer(PerFieldAnalyzerWrapper): self.addAnalyzer("publisher", simple) self.addAnalyzer("author", simple) self.addAnalyzer("is_book", keyword) + # shouldn't the title have two forms? _pl and simple? self.addAnalyzer("themes", simple) self.addAnalyzer("themes_pl", polish) @@ -62,6 +63,8 @@ class WLAnalyzer(PerFieldAnalyzerWrapper): self.addAnalyzer("tag_name", simple) self.addAnalyzer("tag_name_pl", polish) + self.addAnalyzer("translators", simple) + self.addAnalyzer("KEYWORD", keyword) self.addAnalyzer("SIMPLE", simple) self.addAnalyzer("POLISH", polish) @@ -117,13 +120,11 @@ class Snippets(object): self.file.write(txt) pos = (self.position, l) self.position += l - print "Snip<%s>%s" %(pos, txt) return pos def get(self, pos): self.file.seek(pos[0], 0) txt = self.file.read(pos[1]).decode('utf-8') - print "got from snippets %d bytes from %s:" % (len(txt), pos) return txt def close(self): @@ -214,7 +215,7 @@ class Index(IndexStore): def extract_metadata(self, book): fields = {} - book_info = dcparser.parse(book.xml_file) + book_info = dcparser.parse(open(book.xml_file.path)) print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident)) @@ -317,6 +318,13 @@ class Index(IndexStore): return doc + def give_me_utf8(s): + if isinstance(s, unicode): + return s.encode('utf-8') + else: + return s + + fragments = {} snippets = Snippets(book.id).open('w') try: @@ -339,13 +347,16 @@ class Index(IndexStore): fragments[fid]['content'].append(start.tail) elif start is not None and start.tag == 'motyw': fid = start.attrib['id'][1:] - fragments[fid]['themes'].append(start.text) + if start.text is not None: + fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(','))) fragments[fid]['content'].append(start.tail) elif start is not None and start.tag == 'end': fid = start.attrib['id'][1:] if fid not in fragments: continue # a broken node, skip it frag = fragments[fid] + if frag['themes'] == []: + continue # empty themes list. del fragments[fid] def jstr(l): @@ -486,6 +497,7 @@ class Search(IndexStore): bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) return (bks, tops.totalHits) + def search(self, query, max_results=50): query = self.query(query) query = self.wrapjoins(query, ["content", "themes"]) @@ -560,26 +572,36 @@ class SearchResult(object): book = property(get_book) - def get_parts(self): - book = self.book + def process_hits(self): + frags = filter(lambda r: r[1] is not None, self.hits) + sect = filter(lambda r: r[1] is None, self.hits) + sect = filter(lambda s: 0 == len(filter( + lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2], + frags)), sect) + + hits = [] - def sections_covered(results): - frags = filter(lambda r: r[1] is not None, results) - sect = filter(lambda r: r[1] is None, results) - sect = filter(lambda s: 0 == len(filter( - lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2], - frags)), sect) - print "filtered, non overlapped sections: %s" % sect - return frags + sect + for s in sect: + m = {'score': s[2], + 'header_index': s[0][1] + } + m.update(s[3]) + hits.append(m) - parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \ - + [{"fragment": book.fragments.get(anchor=f), '_score_key':f} for f in self.fragments] + for f in frags: + frag = catalogue.models.Fragment.objects.get(anchor=f[1]) + m = {'score': f[2], + 'fragment': frag, + 'themes': frag.tags.filter(category='theme') + } + m.update(f[3]) + hits.append(m) - parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']])) - print("bookid: %d parts: %s" % (self.book_id, parts)) - return parts + hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True) - parts = property(get_parts) + print("--- %s" % hits) + + return hits def __unicode__(self): return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score) @@ -605,10 +627,10 @@ class Hint(object): self.search = search self.book_tags = {} self.part_tags = [] - self.book = None + self._books = [] - def book(self, book): - self.book = book + def books(self, *books): + self._books = books def tags(self, tags): for t in tags: @@ -639,13 +661,21 @@ class Hint(object): return None def part_filter(self): + fs = [] if self.part_tags: - return self.tag_filter(self.part_tags, field='themes') - else: - return None - + fs.append(self.tag_filter(self.part_tags, field='themes')) + + if self._books != []: + bf = BooleanFilter() + for b in self._books: + id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True) + bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD)) + fs.append(bf) + + return MultiSearch.chain_filters(fs) + def should_search_for_book(self): - return self.book is None + return self._books == [] def just_search_in(self, all): """Holds logic to figure out which indexes should be search, when we have some hinst already""" @@ -653,7 +683,7 @@ class Hint(object): for field in all: if field == 'author' and 'author' in self.book_tags: continue - if field == 'title' and self.book is not None: + if field == 'title' and self._books != []: continue if (field == 'themes' or field == 'themes_pl') and self.part_tags: continue @@ -726,9 +756,9 @@ class MultiSearch(Search): q.add(BooleanClause(term, modal)) return q - def content_query(self, query): - return BlockJoinQuery(query, self.parent_filter, - BlockJoinQuery.ScoreMode.Total) + # def content_query(self, query): + # return BlockJoinQuery(query, self.parent_filter, + # BlockJoinQuery.ScoreMode.Total) def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None): fields_to_search = ['author', 'title'] @@ -781,42 +811,30 @@ class MultiSearch(Search): tokens = self.get_tokens(searched) if hint is None or hint.just_search_in(['themes_pl']) != []: - q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl', fuzzy=fuzzy), BooleanClause.Occur.MUST)) + q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl', + fuzzy=fuzzy), BooleanClause.Occur.MUST)) - q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) + q.add(BooleanClause(self.make_term_query(tokens, field='content', + fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) topDocs = self.searcher.search(q, only_in, max_results) for found in topDocs.scoreDocs: books.append(SearchResult(self.searcher, found)) - # joined query themes/content x author/title/epochs/genres/kinds - # q = BooleanQuery() - # in_meta = BooleanQuery() - # in_content = BooleanQuery() - - # for fld in ['themes', 'content']: - # in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD)) - - # in_meta.add(BooleanClause(self.make_term_query( - # self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) - - # for fld in ['title', 'epochs', 'genres', 'kinds']: - # in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD)) + # query themes/content x author/title/tags + q = BooleanQuery() + in_meta = BooleanQuery() + in_content = BooleanQuery() - # q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST)) - # in_content_join = self.content_query(in_content) - # q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST)) - # # import pdb; pdb.set_trace() - # collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True) + for fld in ['themes', 'content', 'tags', 'author', 'title']: + in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD)) - # self.searcher.search(q, collector) + topDocs = self.searcher.search(q, only_in, max_results) + for found in topDocs.scoreDocs: + books.append(SearchResult(self.searcher, found)) - # top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True) - # if top_groups: - # for grp in top_groups.groups: - # for part in grp.scoreDocs: - # books.append(SearchResult(self.searcher, part, score=grp.maxScore)) return books + def multisearch(self, query, max_results=50): """ @@ -890,7 +908,6 @@ class MultiSearch(Search): # highlighter.getBestTextFragments(tokenStream, text, False, 10) # import pdb; pdb.set_trace() snip = highlighter.getBestFragments(tokenStream, text, 3, "...") - print('snips: %s' % snip) return [snip] @@ -923,6 +940,14 @@ class MultiSearch(Search): return tags + def search_books(self, query, filter=None, max_results=10): + bks = [] + tops = self.searcher.search(query, filter, max_results) + for found in tops.scoreDocs: + doc = self.searcher.doc(found.doc) + bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + return bks + def create_prefix_phrase(self, toks, field): q = MultiPhraseQuery() for i in range(len(toks)):