Some search fixes.
author Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Thu, 26 Jan 2012 13:16:29 +0000 (14:16 +0100)
committer Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
Thu, 26 Jan 2012 13:16:29 +0000 (14:16 +0100)
apps/catalogue/views.py
apps/search/index.py
apps/search/templatetags/search_tags.py
apps/search/views.py
wolnelektury/static/css/book_box.css
wolnelektury/templates/catalogue/book_searched.html
wolnelektury/templates/catalogue/search_multiple_hits.html
wolnelektury/templates/catalogue/search_no_hits.html

index 20fecb7..14147c8 100644 (file)
@@ -103,7 +103,6 @@ def differentiate_tags(request, tags, ambiguous_slugs):
 
 
 def tagged_object_list(request, tags=''):
 
 
 def tagged_object_list(request, tags=''):
-    #    import pdb; pdb.set_trace()
     try:
         tags = models.Tag.get_tag_list(tags)
     except models.Tag.DoesNotExist:
     try:
         tags = models.Tag.get_tag_list(tags)
     except models.Tag.DoesNotExist:
index adb7679..33836ad 100644 (file)
@@ -256,7 +256,7 @@ class Index(BaseIndex):
         self.index.addDocument(book_doc)
         del book_doc
 
         self.index.addDocument(book_doc)
         del book_doc
 
-        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])
+        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
 
     master_tags = [
         'opowiadanie',
 
     master_tags = [
         'opowiadanie',
@@ -264,11 +264,22 @@ class Index(BaseIndex):
         'dramat_wierszowany_l',
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
         'dramat_wierszowany_l',
         'dramat_wierszowany_lp',
         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
-        'wywiad'
+        'wywiad',
         ]
 
         ]
 
+    ignore_content_tags = [
+        'uwaga', 'extra',
+        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
+        'didaskalia',
+        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
+        ]
+
+    footnote_tags = ['pa', 'pt', 'pr', 'pe']
+
     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']
 
+    published_date_re = re.compile("([0-9]+)[\]. ]*$")
+
     def extract_metadata(self, book, book_info=None):
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
     def extract_metadata(self, book, book_info=None):
         """
         Extract metadata from book and returns a map of fields keyed by fieldname
@@ -309,6 +320,13 @@ class Index(BaseIndex):
                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
                     fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                                (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
 
+        # get published date
+        source = book_info.source_name
+        match = self.published_date_re.search(source)
+        print("published date is %s %s" % (match, match is not None and match.groups()))
+        if match is not None:
+            fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
+
         return fields
 
     def add_gaps(self, fields, fieldname):
         return fields
 
     def add_gaps(self, fields, fieldname):
@@ -341,15 +359,26 @@ class Index(BaseIndex):
         if master is None:
             return []
 
         if master is None:
             return []
 
-        def walker(node):
+        def walker(node, ignore_tags=[]):
             yield node, None
             yield node, None
-            for child in list(node):
+            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                 for b, e in walker(child):
                     yield b, e
             yield None, node
             return
 
         def fix_format(text):
                 for b, e in walker(child):
                     yield b, e
             yield None, node
             return
 
         def fix_format(text):
+            #            separator = [u" ", u"\t", u".", u";", u","]
+            if isinstance(text, list):
+                # need to join it first
+                text = filter(lambda s: s is not None, content)
+                text = u' '.join(text)
+                # for i in range(len(text)):
+                #     if i > 0:
+                #         if text[i][0] not in separator\
+                #             and text[i - 1][-1] not in separator:
+                #          text.insert(i, u" ")
+
             return re.sub("(?m)/$", "", text)
 
         def add_part(snippets, **fields):
             return re.sub("(?m)/$", "", text)
 
         def add_part(snippets, **fields):
@@ -408,9 +437,21 @@ class Index(BaseIndex):
 
                 # section content
                 content = []
 
                 # section content
                 content = []
+                footnote = None
+
+                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
+                    # handle footnotes
+                    if start is not None and start.tag in self.footnote_tags:
+                        footnote = ' '.join(start.itertext())
+                    elif end is not None and footnote is not None and end.tag in self.footnote_tags:
+                        doc = add_part(snippets, header_index=position, header_type=header.tag,
+                                       content=footnote)
+
+                        self.index.addDocument(doc)
+
+                        footnote = None
 
 
-                for start, end in walker(header):
-                        # handle fragments and themes.
+                    # handle fragments and themes.
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                     if start is not None and start.tag == 'begin':
                         fid = start.attrib['id'][1:]
                         fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
@@ -430,17 +471,12 @@ class Index(BaseIndex):
                             continue  # empty themes list.
                         del fragments[fid]
 
                             continue  # empty themes list.
                         del fragments[fid]
 
-                        def jstr(l):
-                            return u' '.join(map(
-                                lambda x: x == None and u'(none)' or unicode(x),
-                                l))
-
                         doc = add_part(snippets,
                                        header_type=frag['start_header'],
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
                         doc = add_part(snippets,
                                        header_type=frag['start_header'],
                                        header_index=frag['start_section'],
                                        header_span=position - frag['start_section'] + 1,
                                        fragment_anchor=fid,
-                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
+                                       content=fix_format(frag['content']),
                                        themes=frag['themes'])
 
                         self.index.addDocument(doc)
                                        themes=frag['themes'])
 
                         self.index.addDocument(doc)
@@ -457,7 +493,7 @@ class Index(BaseIndex):
 
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position, header_type=header.tag,
 
                         # in the end, add a section text.
                 doc = add_part(snippets, header_index=position, header_type=header.tag,
-                               content=fix_format(u' '.join(filter(lambda s: s is not None, content))))
+                               content=fix_format(content))
 
                 self.index.addDocument(doc)
                 position += 1
 
                 self.index.addDocument(doc)
                 position += 1
@@ -555,20 +591,20 @@ class JoinSearch(object):
 
 
 class SearchResult(object):
 
 
 class SearchResult(object):
-    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
+    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
         if tokens_cache is None: tokens_cache = {}
         if tokens_cache is None: tokens_cache = {}
-            
+
         if score:
             self._score = score
         else:
             self._score = scoreDocs.score
         if score:
             self._score = score
         else:
             self._score = scoreDocs.score
-            
+
         self.boost = 1.0
 
         self._hits = []
         self._processed_hits = None  # processed hits
 
         self.boost = 1.0
 
         self._hits = []
         self._processed_hits = None  # processed hits
 
-        stored = searcher.doc(scoreDocs.doc)
+        stored = search.searcher.doc(scoreDocs.doc)
         self.book_id = int(stored.get("book_id"))
 
         header_type = stored.get("header_type")
         self.book_id = int(stored.get("book_id"))
 
         header_type = stored.get("header_type")
@@ -581,13 +617,19 @@ class SearchResult(object):
 
         fragment = stored.get("fragment_anchor")
 
 
         fragment = stored.get("fragment_anchor")
 
+        pd = stored.get("published_date")
+        if pd is None:
+            print "published_date is none for book %d" % self.book_id
+            pd = 0
+        self.published_date = int(pd)
+
         if snippets:
             snippets = snippets.replace("/\n", "\n")
         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 
         self._hits.append(hit)
 
         if snippets:
             snippets = snippets.replace("/\n", "\n")
         hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
 
         self._hits.append(hit)
 
-        self.searcher = searcher
+        self.search = search
         self.searched = searched
         self.tokens_cache = tokens_cache
 
         self.searched = searched
         self.tokens_cache = tokens_cache
 
@@ -669,11 +711,12 @@ class SearchResult(object):
             themes = frag.tags.filter(category='theme')
             themes_hit = []
             if self.searched is not None:
             themes = frag.tags.filter(category='theme')
             themes_hit = []
             if self.searched is not None:
-                tokens = self.searcher.get_tokens(self.searched, 'POLISH', tokens_cache=self.tokens_cache)
+                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                 for theme in themes:
                 for theme in themes:
-                    name_tokens = self.searcher.get_tokens(theme.name, 'POLISH')
+                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
+                    print "THEME HIT: %s in %s" % (tokens, name_tokens)
                     for t in tokens:
                     for t in tokens:
-                        if name_tokens.index(t):
+                        if t in name_tokens:
                             if not theme in themes_hit:
                                 themes_hit.append(theme)
                             break
                             if not theme in themes_hit:
                                 themes_hit.append(theme)
                             break
@@ -709,7 +752,12 @@ class SearchResult(object):
         return books.values()
 
     def __cmp__(self, other):
         return books.values()
 
     def __cmp__(self, other):
-        return cmp(self.score, other.score)
+        c = cmp(self.score, other.score)
+        if c == 0:
+            # this is inverted, because earlier date is better
+            return cmp(other.published_date, self.published_date)
+        else:
+            return c
 
 
 class Hint(object):
 
 
 class Hint(object):
@@ -930,7 +978,7 @@ class Search(IndexStore):
             filters.append(self.term_filter(Term('is_book', 'true')))
         top = self.searcher.search(query, self.chain_filters(filters), max_results)
 
             filters.append(self.term_filter(Term('is_book', 'true')))
         top = self.searcher.search(query, self.chain_filters(filters), max_results)
 
-        return [SearchResult(self.searcher, found, snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
+        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
 
     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                     filters=None, tokens_cache=None, boost=None):
 
     def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                     filters=None, tokens_cache=None, boost=None):
@@ -950,7 +998,8 @@ class Search(IndexStore):
 
         top = self.searcher.search(query, self.chain_filters(filters), max_results)
 
 
         top = self.searcher.search(query, self.chain_filters(filters), max_results)
 
-        return [SearchResult(self.searcher, found, searched=searched, tokens_cache=tokens_cache, snippets=self.get_snippets(found, query)) for found in top.scoreDocs]
+        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
+                             snippets=self.get_snippets(found, query)) for found in top.scoreDocs]
 
     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
         """
 
     def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
         """
@@ -973,7 +1022,7 @@ class Search(IndexStore):
                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                 max_results)
             for found in top.scoreDocs:
                 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                 max_results)
             for found in top.scoreDocs:
-                books.append(SearchResult(self.searcher, found, how_found="search_perfect_book"))
+                books.append(SearchResult(self, found, how_found="search_perfect_book"))
         return books
 
     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
         return books
 
     def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
@@ -999,7 +1048,7 @@ class Search(IndexStore):
                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
             max_results)
         for found in top.scoreDocs:
                                    self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
             max_results)
         for found in top.scoreDocs:
-            books.append(SearchResult(self.searcher, found, how_found="search_book"))
+            books.append(SearchResult(self, found, how_found="search_book"))
 
         return books
 
 
         return books
 
@@ -1021,7 +1070,7 @@ class Search(IndexStore):
                                                            flt]),
                                        max_results)
             for found in top.scoreDocs:
                                                            flt]),
                                        max_results)
             for found in top.scoreDocs:
-                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
+                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
 
         return books
 
 
         return books
 
@@ -1054,7 +1103,7 @@ class Search(IndexStore):
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
-            books.append(SearchResult(self.searcher, found, how_found='search_everywhere_themesXcontent'))
+            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
             print "* %s theme x content: %s" % (searched, books[-1]._hits)
 
         # query themes/content x author/title/tags
             print "* %s theme x content: %s" % (searched, books[-1]._hits)
 
         # query themes/content x author/title/tags
@@ -1073,7 +1122,7 @@ class Search(IndexStore):
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
 
         topDocs = self.searcher.search(q, only_in, max_results)
         for found in topDocs.scoreDocs:
-            books.append(SearchResult(self.searcher, found, how_found='search_everywhere'))
+            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
             print "* %s scatter search: %s" % (searched, books[-1]._hits)
 
         return books
             print "* %s scatter search: %s" % (searched, books[-1]._hits)
 
         return books
index 32d5b64..5bbc422 100644 (file)
@@ -44,6 +44,7 @@ def book_searched(context, result):
     return {
         'related': book.related_info(),
         'book': book,
     return {
         'related': book.related_info(),
         'book': book,
+        'main_link': book.get_absolute_url(),
         'request': context.get('request'),
         'hits': hits,
     }
         'request': context.get('request'),
         'hits': hits,
     }
index 00391f1..052b2f1 100644 (file)
@@ -15,7 +15,7 @@ from catalogue.views import JSONResponse
 from search import Search, JVM, SearchResult
 from lucene import StringReader
 from suggest.forms import PublishingSuggestForm
 from search import Search, JVM, SearchResult
 from lucene import StringReader
 from suggest.forms import PublishingSuggestForm
-
+import re
 import enchant
 
 dictionary = enchant.Dict('pl_PL')
 import enchant
 
 dictionary = enchant.Dict('pl_PL')
@@ -140,13 +140,35 @@ def main(request):
         
         text_phrase = SearchResult.aggregate(srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False))
         
         
         text_phrase = SearchResult.aggregate(srch.search_phrase(toks, 'content', fuzzy=fuzzy, tokens_cache=tokens_cache, snippets=True, book=False))
         
-        everywhere = SearchResult.aggregate(srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache), author_title_rest)
+        everywhere = srch.search_everywhere(toks, fuzzy=fuzzy, tokens_cache=tokens_cache)
+
+        def already_found(results):
+            def f(e):
+                for r in results:
+                    if e.book_id == r.book_id:
+                        results.append(e)
+                        return True
+                return False
+            return f
+        f = already_found(author_results + title_results)
+        everywhere = filter(lambda x: not f(x), everywhere)
+
+        author_results = SearchResult.aggregate(author_results)
+        title_results = SearchResult.aggregate(title_results)
+        
+        everywhere = SearchResult.aggregate(everywhere, author_title_rest)
 
         for res in [author_results, title_results, text_phrase, everywhere]:
             res.sort(reverse=True)
 
         for res in [author_results, title_results, text_phrase, everywhere]:
             res.sort(reverse=True)
-
+            for r in res:
+                for h in r.hits:
+                    h['snippets'] = map(lambda s:
+                                        re.subn(r"(^[ \t\n]+|[ \t\n]+$)", u"", 
+                                                re.subn(r"[ \t\n]*\n[ \t\n]*", u"\n", s)[0])[0], h['snippets'])
+                    
         suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
         suggestion = did_you_mean(query, srch.get_tokens(toks, field="SIMPLE"))
-
+        print "dym? %s" % repr(suggestion).encode('utf-8')
+        
         results = author_results + title_results + text_phrase + everywhere
         results.sort(reverse=True)
         
         results = author_results + title_results + text_phrase + everywhere
         results.sort(reverse=True)
         
index f54f5ff..8a91bc3 100755 (executable)
@@ -319,8 +319,10 @@ ul.inline-items li {
 }
 
 .snippets .snippet-text {
 }
 
 .snippets .snippet-text {
+    background: #f7f7f7;
     font-size: 1.2em;
     margin: 1.083em 0em;
     font-size: 1.2em;
     margin: 1.083em 0em;
+    padding: 1em;
 }
 
 .snippets .anchor {
 }
 
 .snippets .anchor {
index 4b92c19..935f332 100644 (file)
@@ -1,5 +1,5 @@
 {% extends "catalogue/book_short.html" %}
 {% extends "catalogue/book_short.html" %}
-{% load i18n %}
+{% load i18n catalogue_tags %}
 
 
 {% block box-class %}search-result{% endblock %}
 
 
 {% block box-class %}search-result{% endblock %}
@@ -8,15 +8,14 @@
 <div class="snippets">
   {% for hit in hits %}
   {% if hit.snippets %}
 <div class="snippets">
   {% for hit in hits %}
   {% if hit.snippets %}
-  <p>In text:</p>
-  <div class="snippet-text"><a href="{% url book_text book.slug %}#f{{hit.section_number}}">{{hit.snippets.0|safe}}</a></div>
+  <div class="snippet-text"><a href="{% url book_text book.slug %}">{{hit.snippets.0|safe}}</a></div>
   {% else %}
   {% if hit.fragment %}
   <div class="snippet-text">
   {% else %}
   {% if hit.fragment %}
   <div class="snippet-text">
-    <p>{% trans "In fragment" %}
-      {% if hit.themes_hit %}{% trans ", for themes:" %}{% for t in hit.themes_hit %}{{t.name}} {% endfor %}{% endif %}
-    </p>
-    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.short_text|safe}}</a>
+    {% if hit.themes_hit %}
+    {% inline_tag_list hit.themes_hit  %}
+    {% endif %}
+    <a href="{{hit.fragment.get_absolute_url}}">{{hit.fragment.text|truncatewords_html:15|safe}}</a>
   </div>
   {% endif %}
   {% endif %}
   </div>
   {% endif %}
   {% endif %}
index 6739645..ed06c94 100644 (file)
@@ -8,7 +8,7 @@
 
 {% block body %}
     {% if did_you_mean %}
 
 {% block body %}
     {% if did_you_mean %}
-      <span class="did_you_mean">{% trans "Dod you mean" %} <a href="{% url search %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a></b>?</span>
+      <span class="did_you_mean">{% trans "Did you mean" %} <a href="{% url search %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
     {% endif %}
     <!-- tu pójdą trafienia w tagi: Autorzy - z description oraz motywy i rodzaje (z book_count) -->
 
     {% endif %}
     <!-- tu pójdą trafienia w tagi: Autorzy - z description oraz motywy i rodzaje (z book_count) -->
 
index 50ad2d3..3b1e85b 100644 (file)
 
     <div class="left-column">
     <div class="normal-text">
 
     <div class="left-column">
     <div class="normal-text">
+      <p>    
+       {% if did_you_mean %}
+       <span class="did_you_mean">{% trans "Did you mean" %} <a href="{% url search %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
+       {% endif %}
+      </p>
         <p>{% trans "Sorry! Search cirteria did not match any resources." %}</p>
 
                <p>{% blocktrans %}Search engine supports following criteria: title, author, theme/topic, epoch, kind and genre.
         <p>{% trans "Sorry! Search cirteria did not match any resources." %}</p>
 
                <p>{% blocktrans %}Search engine supports following criteria: title, author, theme/topic, epoch, kind and genre.