From: Marcin Koziej Date: Tue, 13 Dec 2011 10:26:02 +0000 (+0100) Subject: OPDS advanced search X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/cb91a32c4411dcc5bd3b433536fea0dea64ea493 OPDS advanced search --- diff --git a/apps/opds/views.py b/apps/opds/views.py index c7d38284b..cd91a743f 100644 --- a/apps/opds/views.py +++ b/apps/opds/views.py @@ -5,6 +5,7 @@ from base64 import b64encode import os.path from urlparse import urljoin +from urllib2 import unquote from django.contrib.syndication.views import Feed from django.core.urlresolvers import reverse @@ -16,7 +17,11 @@ from django.contrib.sites.models import Site from basicauth import logged_in_or_basicauth, factory_decorator from catalogue.models import Book, Tag -from catalogue.views import books_starting_with + +from search import MultiSearch, SearchResult, JVM +from lucene import Term, QueryWrapperFilter, TermQuery + +import re from stats.utils import piwik_track @@ -316,20 +321,120 @@ class UserSetFeed(AcquisitionFeed): # no class decorators in python 2.5 #UserSetFeed = factory_decorator(logged_in_or_basicauth())(UserSetFeed) + @piwik_track class SearchFeed(AcquisitionFeed): description = u"Wyniki wyszukiwania na stronie WolneLektury.pl" title = u"Wyniki wyszukiwania" + + INLINE_QUERY_RE = re.compile(r"(author:(?P<author>[^ ]+)|title:(?P<title>[^ ]+)|categories:(?P<categories>[^ ]+)|description:(?P<description>[^ ]+))") def get_object(self, request): - return request.GET.get('q', '') + """ + For OPDS 1.1 We should handle a query for search terms + and atom:author, atom:contributor, atom:title + if search terms are provided, we shall search for books + according to Hint information (from author & contributor & title). 
+ + but if search terms are empty, we should do a different search + (perhaps for is_book=True) + + """ + JVM.attachCurrentThread() + + query = request.GET.get('q', '') + + inline_criteria = re.findall(self.INLINE_QUERY_RE, query) + if inline_criteria: + def get_criteria(criteria, name, position): + e = filter(lambda el: el[0][0:len(name)] == name, criteria) + print e + if not e: + return None + c = e[0][position] + print c + if c[0] == '"' and c[-1] == '"': + c = c[1:-1] + c = c.replace('+', ' ') + return c + + #import pdb; pdb.set_trace() + author = get_criteria(inline_criteria, 'author', 1) + title = get_criteria(inline_criteria, 'title', 2) + translator = None + categories = get_criteria(inline_criteria, 'categories', 3) + query = get_criteria(inline_criteria, 'description', 4) + else: + author = request.GET.get('author', '') + title = request.GET.get('title', '') + translator = request.GET.get('translator', '') + categories = None + fuzzy = False + + + srch = MultiSearch() + hint = srch.hint() + + # Scenario 1: full search terms provided. + # Use auxiliary information to narrow it and make it better. 
+ if query: + filters = [] + + if author: + print "narrow to author %s" % author + hint.tags(srch.search_tags(author, filter=srch.term_filter(Term('tag_category', 'author')))) + + if translator: + print "filter by translator %s" % translator + filters.append(QueryWrapperFilter( + srch.make_phrase(srch.get_tokens(translator, field='translators'), + field='translators'))) + + if categories: + filters.append(QueryWrapperFilter( + srch.make_phrase(srch.get_tokens(categories, field="tag_name_pl"), + field='tag_name_pl'))) + + flt = srch.chain_filters(filters) + if title: + print "hint by book title %s" % title + q = srch.make_phrase(srch.get_tokens(title, field='title'), field='title') + hint.books(*srch.search_books(q, filter=flt)) + + toks = srch.get_tokens(query) + print "tokens: %s" % toks + # import pdb; pdb.set_trace() + results = SearchResult.aggregate(srch.search_perfect_book(toks, fuzzy=fuzzy, hint=hint), + srch.search_perfect_parts(toks, fuzzy=fuzzy, hint=hint), + srch.search_everywhere(toks, fuzzy=fuzzy, hint=hint)) + results.sort(reverse=True) + return [r.book for r in results] + else: + # Scenario 2: since we no longer have to figure out what the query term means to the user, + # we can just use filters and not the Hint class. 
+ filters = [] + + fields = { + 'author': author, + 'translators': translator, + 'title': title + } + + for fld, q in fields.items(): + if q: + filters.append(QueryWrapperFilter( + srch.make_phrase(srch.get_tokens(q, field=fld), field=fld))) + + flt = srch.chain_filters(filters) + books = srch.search_books(TermQuery(Term('is_book', 'true')), filter=flt) + return books def get_link(self, query): - return "%s?q=%s" % (reverse('search'), query) + return "%s?q=%s" % (reverse('search'), query) - def items(self, query): + def items(self, books): try: - return books_starting_with(query) + return books except ValueError: # too short a query return [] diff --git a/apps/search/index.py b/apps/search/index.py index 6e2140ac1..8b0cfb79f 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -55,6 +55,7 @@ class WLAnalyzer(PerFieldAnalyzerWrapper): self.addAnalyzer("publisher", simple) self.addAnalyzer("author", simple) self.addAnalyzer("is_book", keyword) + # shouldn't the title have two forms? _pl and simple? 
self.addAnalyzer("themes", simple) self.addAnalyzer("themes_pl", polish) @@ -62,6 +63,8 @@ class WLAnalyzer(PerFieldAnalyzerWrapper): self.addAnalyzer("tag_name", simple) self.addAnalyzer("tag_name_pl", polish) + self.addAnalyzer("translators", simple) + self.addAnalyzer("KEYWORD", keyword) self.addAnalyzer("SIMPLE", simple) self.addAnalyzer("POLISH", polish) @@ -212,7 +215,7 @@ class Index(IndexStore): def extract_metadata(self, book): fields = {} - book_info = dcparser.parse(book.xml_file) + book_info = dcparser.parse(open(book.xml_file.path)) print("extract metadata for book %s id=%d, thread%d" % (book.slug, book.id, current_thread().ident)) @@ -494,6 +497,7 @@ class Search(IndexStore): bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) return (bks, tops.totalHits) + def search(self, query, max_results=50): query = self.query(query) query = self.wrapjoins(query, ["content", "themes"]) @@ -623,10 +627,10 @@ class Hint(object): self.search = search self.book_tags = {} self.part_tags = [] - self._book = None + self._books = [] - def book(self, book): - self._book = book + def books(self, *books): + self._books = books def tags(self, tags): for t in tags: @@ -660,12 +664,18 @@ class Hint(object): fs = [] if self.part_tags: fs.append(self.tag_filter(self.part_tags, field='themes')) - if self._book is not None: - fs.append(NumericRangeFilter.newIntRange('book_id', self._book.id, self._book.id, True, True)) + + if self._books != []: + bf = BooleanFilter() + for b in self._books: + id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True) + bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD)) + fs.append(bf) + return MultiSearch.chain_filters(fs) def should_search_for_book(self): - return self._book is None + return self._books == [] def just_search_in(self, all): """Holds logic to figure out which indexes should be search, when we have some hinst already""" @@ -673,7 +683,7 @@ class Hint(object): for field in all: if field 
== 'author' and 'author' in self.book_tags: continue - if field == 'title' and self._book is not None: + if field == 'title' and self._books != []: continue if (field == 'themes' or field == 'themes_pl') and self.part_tags: continue @@ -746,9 +756,9 @@ class MultiSearch(Search): q.add(BooleanClause(term, modal)) return q - def content_query(self, query): - return BlockJoinQuery(query, self.parent_filter, - BlockJoinQuery.ScoreMode.Total) + # def content_query(self, query): + # return BlockJoinQuery(query, self.parent_filter, + # BlockJoinQuery.ScoreMode.Total) def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None): fields_to_search = ['author', 'title'] @@ -930,6 +940,14 @@ class MultiSearch(Search): return tags + def search_books(self, query, filter=None, max_results=10): + bks = [] + tops = self.searcher.search(query, filter, max_results) + for found in tops.scoreDocs: + doc = self.searcher.doc(found.doc) + bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id"))) + return bks + def create_prefix_phrase(self, toks, field): q = MultiPhraseQuery() for i in range(len(toks)): diff --git a/apps/search/views.py b/apps/search/views.py index c50077005..fad5e6f61 100644 --- a/apps/search/views.py +++ b/apps/search/views.py @@ -124,7 +124,7 @@ def main(request): context_instance=RequestContext(request)) hint.tags(tag_list) - hint.book(book) + hint.books(book) toks = StringReader(query) fuzzy = 'fuzzy' in request.GET diff --git a/wolnelektury/settings.py b/wolnelektury/settings.py index 814861436..826abb2ca 100644 --- a/wolnelektury/settings.py +++ b/wolnelektury/settings.py @@ -230,9 +230,9 @@ API_WAIT = 10 # limit number of filtering tags MAX_TAG_LIST = 6 -NO_BUILD_EPUB = True +NO_BUILD_EPUB = False NO_BUILD_TXT = False -NO_BUILD_PDF = True +NO_BUILD_PDF = False NO_BUILD_MOBI = True NO_SEARCH_INDEX = False SEARCH_INDEX_PARALLEL = False diff --git a/wolnelektury/static/opensearch.xml b/wolnelektury/static/opensearch.xml index 
9cc056541..a603966ab 100644 --- a/wolnelektury/static/opensearch.xml +++ b/wolnelektury/static/opensearch.xml @@ -11,7 +11,7 @@ <Image height="16" width="16" type="image/x-icon">http://www.wolnelektury.pl/static/img/favicon.ico</Image> <Image height="64" width="64" type="image/png">http://www.wolnelektury.pl/static/img/wl_icon_64.png</Image> <Url type="application/atom+xml;profile=opds-catalog" - template="http://www.wolnelektury.pl/opds/search/?q={searchTerms}" /> + template="http://www.wolnelektury.pl/opds/search/?q={searchTerms}&author={atom:author}&translator={atom:contributor}&title={atom:title}" /> <Url type="text/html" method="GET" template="http://www.wolnelektury.pl/katalog/szukaj?q={searchTerms}" /> <Url type="application/x-suggestions+json" method="GET" template="http://www.wolnelektury.pl/katalog/jtags?mozhint=1&q={searchTerms}" /> <moz:SearchForm>http://www.wolnelektury.pl/katalog/</moz:SearchForm>