X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/5c959cbb46c29a03cb6c8bc0e8b5aae5765bf150..cb91a32c4411dcc5bd3b433536fea0dea64ea493:/apps/search/index.py diff --git a/apps/search/index.py b/apps/search/index.py index 9b7efa2b6..8b0cfb79f 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -55,6 +55,7 @@ class WLAnalyzer(PerFieldAnalyzerWrapper): self.addAnalyzer("publisher", simple) self.addAnalyzer("author", simple) self.addAnalyzer("is_book", keyword) + # shouldn't the title have two forms? _pl and simple? self.addAnalyzer("themes", simple) self.addAnalyzer("themes_pl", polish) @@ -62,6 +63,8 @@ class WLAnalyzer(PerFieldAnalyzerWrapper): self.addAnalyzer("tag_name", simple) self.addAnalyzer("tag_name_pl", polish) + self.addAnalyzer("translators", simple) + self.addAnalyzer("KEYWORD", keyword) self.addAnalyzer("SIMPLE", simple) self.addAnalyzer("POLISH", polish) @@ -117,13 +120,11 @@ class Snippets(object): self.file.write(txt) pos = (self.position, l) self.position += l - print "Snip<%s>%s" %(pos, txt) return pos def get(self, pos): self.file.seek(pos[0], 0) txt = self.file.read(pos[1]).decode('utf-8') - print "got from snippets %d bytes from %s:" % (len(txt), pos) return txt def close(self): @@ -214,7 +215,7 @@ class Index(IndexStore): def extract_metadata(self, book): fields = {} - book_info = dcparser.parse(book.xml_file) + book_info = dcparser.parse(open(book.xml_file.path)) print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident)) @@ -317,6 +318,13 @@ class Index(IndexStore): return doc + def give_me_utf8(s): + if isinstance(s, unicode): + return s.encode('utf-8') + else: + return s + + fragments = {} snippets = Snippets(book.id).open('w') try: @@ -339,13 +347,16 @@ class Index(IndexStore): fragments[fid]['content'].append(start.tail) elif start is not None and start.tag == 'motyw': fid = start.attrib['id'][1:] - fragments[fid]['themes'].append(start.text) + if start.text is not None: + fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(','))) fragments[fid]['content'].append(start.tail) elif start is not None and start.tag == 'end': fid = start.attrib['id'][1:] if fid not in fragments: continue # a broken node, skip it frag = fragments[fid] + if frag['themes'] == []: + continue # empty themes list. del fragments[fid] def jstr(l): @@ -486,6 +497,7 @@ class Search(IndexStore): bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) return (bks, tops.totalHits) + def search(self, query, max_results=50): query = self.query(query) query = self.wrapjoins(query, ["content", "themes"]) @@ -615,10 +627,10 @@ class Hint(object): self.search = search self.book_tags = {} self.part_tags = [] - self._book = None + self._books = [] - def book(self, book): - self._book = book + def books(self, *books): + self._books = books def tags(self, tags): for t in tags: @@ -652,12 +664,18 @@ class Hint(object): fs = [] if self.part_tags: fs.append(self.tag_filter(self.part_tags, field='themes')) - if self._book is not None: - fs.append(NumericRangeFilter.newIntRange('book_id', self._book.id, self._book.id, True, True)) + + if self._books != []: + bf = BooleanFilter() + for b in self._books: + id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True) + bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD)) + fs.append(bf) + return MultiSearch.chain_filters(fs) def should_search_for_book(self): - return self._book is None + return self._books == [] def just_search_in(self, all): """Holds logic to figure out which indexes should be search, when we have some hinst already""" @@ -665,7 +683,7 @@ class Hint(object): for field in all: if field == 'author' and 'author' in self.book_tags: continue - if field == 'title' and self._book is not None: + if field == 'title' and self._books != []: continue if (field == 'themes' or field == 'themes_pl') and self.part_tags: continue @@ -738,9 +756,9 @@ class MultiSearch(Search): q.add(BooleanClause(term, modal)) return q - def content_query(self, query): - return BlockJoinQuery(query, self.parent_filter, - BlockJoinQuery.ScoreMode.Total) + # def content_query(self, query): + # return BlockJoinQuery(query, self.parent_filter, + # BlockJoinQuery.ScoreMode.Total) def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None): fields_to_search = ['author', 'title'] @@ -890,7 +908,6 @@ class MultiSearch(Search): # highlighter.getBestTextFragments(tokenStream, text, False, 10) # import pdb; pdb.set_trace() snip = highlighter.getBestFragments(tokenStream, text, 3, "...") - print('snips: %s' % snip) return [snip] @@ -923,6 +940,14 @@ class MultiSearch(Search): return tags + def search_books(self, query, filter=None, max_results=10): + bks = [] + tops = self.searcher.search(query, filter, max_results) + for found in tops.scoreDocs: + doc = self.searcher.doc(found.doc) + bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + return bks + def create_prefix_phrase(self, toks, field): q = MultiPhraseQuery() for i in range(len(toks)):