search changes

author     Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Wed, 24 Oct 2012 14:49:54 +0000 (16:49 +0200)
committer  Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Wed, 24 Oct 2012 14:49:54 +0000 (16:49 +0200)
apps/search/custom.py
apps/search/index.py
apps/search/management/commands/checkindex.py [deleted file]
apps/search/tests/files/fraszka-do-anusie.xml [deleted file]
apps/search/tests/files/fraszki.xml [deleted file]
apps/search/tests/index.py
apps/search/views.py

diff --git a/apps/search/custom.py b/apps/search/custom.py
index 86d387e..e6f559b 100644
--- a/apps/search/custom.py
+++ b/apps/search/custom.py
@@ -6,6 +6,8 @@ import warnings
 from sunburnt import search
 import copy
 from httplib2 import socket
+import re
+
 
 class TermVectorOptions(search.Options):
     def __init__(self, schema, original=None):
@@ -93,7 +95,6 @@ class CustomSolrInterface(sunburnt.SolrInterface):
             self.init_schema()
         except socket.error, e:
             raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
-            
 
     def _analyze(self, **kwargs):
         if not self.readable:
@@ -134,6 +135,25 @@ class CustomSolrInterface(sunburnt.SolrInterface):
         terms = map(lambda n: unicode(n.text), terms)
         return terms
 
+    def expand_margins(self, text, start, end):
+        totlen = len(text)
+        ws = re.compile(r"\W", re.UNICODE)
+
+        def is_boundary(x):
+            return bool(ws.match(x))
+
+        while start > 0:
+            if is_boundary(text[start - 1]):
+                break
+            start -= 1
+
+        while end < totlen:
+            if is_boundary(text[end]):  # text[end + 1] would skip a char and overrun at the text's end
+                break
+            end += 1
+
+        return (start, end)
+
     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
         start = None
         end = None
@@ -142,23 +162,28 @@ class CustomSolrInterface(sunburnt.SolrInterface):
                               ((s, e),
                                (max(0, s - margins), min(totlen, e + margins))),
                                   matches)
+        matches_margins = map(lambda (m, (s, e)):
+                              (m, self.expand_margins(text, s, e)),
+                              matches_margins)
+
+        # let's start with the first match
         (start, end) = matches_margins[0][1]
-        matches = []
+        matches = [matches_margins[0][0]]
+
         for (m, (s, e)) in matches_margins[1:]:
             if end < s or start > e:
                 continue
             start = min(start, s)
             end = max(end, e)
             matches.append(m)
-            
+
         snip = text[start:end]
         matches.sort(lambda a, b: cmp(b[0], a[0]))
+        # sorted descending by start, so inserting marks doesn't shift earlier offsets
 
         for (s, e) in matches:
             off = - start
             snip = snip[:e + off] + mark[1] + snip[e + off:]
             snip = snip[:s + off] + mark[0] + snip[s + off:]
-            # maybe break on word boundaries
 
         return snip
-
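
For context: the reworked substring() snaps each margin window to word boundaries with the new expand_margins() helper, merges windows that overlap, and inserts the marks back-to-front so earlier offsets stay valid. A minimal usage sketch (hypothetical offsets; solr stands in for a connected CustomSolrInterface):

    # Hypothetical example; the offsets below point at "hamować" and "swoich".
    text = u"Jeśli oczu hamować swoich nie umiały"
    matches = [(11, 18), (19, 25)]  # (start, end) pairs, end exclusive
    snip = solr.substring(text, matches, margins=5, mark=("<b>", "</b>"))
    # The two margin windows overlap, so one merged snippet comes back:
    # u"oczu <b>hamować</b> <b>swoich</b> nie umiały"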
diff --git a/apps/search/index.py b/apps/search/index.py
index 26da062..ea1a6c5 100644
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -548,7 +548,7 @@ class SearchResult(object):
             header_span = header_span is not None and int(header_span) or 1
             fragment = doc.get("fragment_anchor", None)
             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-            snippets_rev = doc['snippets_revision']
+            snippets_rev = doc.get('snippets_revision', None)
 
             hit = (sec + (header_span,), fragment, self._score, {
                 'how_found': how_found,
@@ -561,7 +561,7 @@ class SearchResult(object):
             self._hits.append(hit)
 
     def __unicode__(self):
-        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 
     def __str__(self):
@@ -725,68 +725,6 @@ class Search(SolrIndex):
     def __init__(self, default_field="text"):
         super(Search, self).__init__(mode='r')
 
-    # def get_tokens(self, searched, field='text', cached=None):
-    #     """returns tokens analyzed by a proper (for a field) analyzer
-    #     argument can be: StringReader, string/unicode, or tokens. In the last case
-    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-    #     """
-    #     if cached is not None and field in cached:
-    #         return cached[field]
-
-    #     if isinstance(searched, str) or isinstance(searched, unicode):
-    #         searched = StringReader(searched)
-    #     elif isinstance(searched, list):
-    #         return searched
-
-    #     searched.reset()
-    #     tokens = self.analyzer.reusableTokenStream(field, searched)
-    #     toks = []
-    #     while tokens.incrementToken():
-    #         cta = tokens.getAttribute(CharTermAttribute.class_)
-    #         toks.append(cta.toString())
-
-    #     if cached is not None:
-    #         cached[field] = toks
-
-    #     return toks
-
-    # @staticmethod
-    # def fuzziness(fuzzy):
-    #     """Helper method to sanitize fuzziness"""
-    #     if not fuzzy:
-    #         return None
-    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-    #         return fuzzy
-    #     else:
-    #         return 0.5
-
-    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
-    #     """
-    #     Return a PhraseQuery with a series of tokens.
-    #     """
-    #     if fuzzy:
-    #         phrase = MultiPhraseQuery()
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-    #             fuzzterms = []
-
-    #             while True:
-    #                 ft = fuzzterm.term()
-    #                 if ft:
-    #                     fuzzterms.append(ft)
-    #                 if not fuzzterm.next(): break
-    #             if fuzzterms:
-    #                 phrase.add(JArray('object')(fuzzterms, Term))
-    #             else:
-    #                 phrase.add(term)
-    #     else:
-    #         phrase = PhraseQuery()
-    #         phrase.setSlop(slop)
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             phrase.add(term)
-    #     return phrase
 
     def make_term_query(self, query, field='text', modal=operator.or_):
         """
@@ -828,78 +766,6 @@ class Search(SolrIndex):
         res = query.execute()
         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 
-    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for perfect book matches. Just see if the query matches with some author or title,
-    #     taking hints into account.
-    #     """
-    #     fields_to_search = ['authors', 'title']
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #             max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
-    #     return books
-
-    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     fields_to_search = ['tags', 'authors', 'title']
-
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     tokens = self.get_tokens(searched, field='SIMPLE')
-
-    #     q = BooleanQuery()
-
-    #     for fld in fields_to_search:
-    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-    #     books = []
-    #     top = self.searcher.search(q,
-    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #         max_results)
-    #     for found in top.scoreDocs:
-    #         books.append(SearchResult(self, found, how_found="search_book"))
-
-    #     return books
-
-    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-    #     some part/fragment of the book.
-    #     """
-    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
-
-    #     flt = None
-    #     if hint:
-    #         flt = hint.part_filter()
-
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-    #                                                        flt]),
-    #                                    max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
-    #     return books
 
     def search_everywhere(self, searched, query_terms=None):
         """
@@ -1055,33 +921,6 @@ class Search(SolrIndex):
             except catalogue.models.Book.DoesNotExist: pass
         return bks
  
-    # def make_prefix_phrase(self, toks, field):
-    #     q = MultiPhraseQuery()
-    #     for i in range(len(toks)):
-    #         t = Term(field, toks[i])
-    #         if i == len(toks) - 1:
-    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-    #             if pterms:
-    #                 q.add(pterms)
-    #             else:
-    #                 q.add(t)
-    #         else:
-    #             q.add(t)
-    #     return q
-
-    # @staticmethod
-    # def term_filter(term, inverse=False):
-    #     only_term = TermsFilter()
-    #     only_term.addTerm(term)
-
-    #     if inverse:
-    #         neg = BooleanFilter()
-    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-    #         only_term = neg
-
-    #     return only_term
-
-
 
     @staticmethod
     def apply_filters(query, filters):
@@ -1093,15 +932,3 @@ class Search(SolrIndex):
         for f in filters:
             query = query.query(f)
         return query
-
-    # def filtered_categories(self, tags):
-    #     """
-    #     Return a list of tag categories, present in tags list.
-    #     """
-    #     cats = {}
-    #     for t in tags:
-    #         cats[t.category] = True
-    #     return cats.keys()
-
-    # def hint(self):
-    #     return Hint(self)
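
The blocks removed above were Lucene-era helpers that were already commented out. The live Solr path now goes through index.query(), search_books() and apply_filters(). A hedged sketch of how they compose (search_books() and index.query() appear in the tests below; the Q filter object is an assumption based on sunburnt's usual API):

    # Sketch under the assumptions above, not code from this module.
    search = Search()
    query = search.index.query(authors=u"sęp szarzyński")
    query = Search.apply_filters(query, [search.index.Q(is_book=True)])  # Q assumed
    books = search.search_books(query)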
diff --git a/apps/search/management/commands/checkindex.py b/apps/search/management/commands/checkindex.py
deleted file mode 100644
index b910277..0000000
--- a/apps/search/management/commands/checkindex.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-from django.core.management.base import BaseCommand
-from search import IndexChecker
-
-class Command(BaseCommand):
-    help = 'Check Lucene search index'
-    args = ''
-
-    def handle(self, *args, **opts):
-        checker = IndexChecker()
-        status = checker.check()
-        if status.clean:
-            print "No problems found."
-        else:
-            if status.missingSegments:
-                print "Unable to locate."
-            print "Number of bad segments: %d / %d (max segment name is %d)" % \
-                (status.numBadSegments, status.numSegments, status.maxSegmentName)
-            print "Total lost documents (due to bad segments) %d" % status.totLoseDocCount
-            if not status.validCounter:
-                print "Segment counter is not valid."
-        
diff --git a/apps/search/tests/files/fraszka-do-anusie.xml b/apps/search/tests/files/fraszka-do-anusie.xml
deleted file mode 100755
index 3bbda15..0000000
--- a/apps/search/tests/files/fraszka-do-anusie.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
-  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:S%C4%99p-Szarzy%C5%84ski/Rytmy/Fraszka_do_Anusie">
-<dc:creator xml:lang="pl">Sęp Szarzyński, Mikołaj</dc:creator>
-<dc:title xml:lang="pl">Fraszka do Anusie</dc:title>
-<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
-<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Barok</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre>
-<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:identifier.url>
-<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/8759</dc:source.URL>
-<dc:source xml:lang="pl">Szarzyński Sęp, Mikołaj (ca 1550-1581), Rytmy abo Wiersze polskie w wyborze, E. Wende, Warszawa, 1914</dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Mikołaj Sęp Szarzyński zm. 1581</dc:rights>
-<dc:date.pd xml:lang="pl">1581</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2008-12-29</dc:date>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-  <liryka_l>
-
-<autor_utworu>Mikołaj Sęp Szarzyński</autor_utworu>
-
-<nazwa_utworu>Fraszka do Anusie</nazwa_utworu>
-
-
-
-<strofa><begin id="b1230084410751"/><motyw id="m1230084410751">Kochanek, Łzy, Miłość, Oko, Serce, Wzrok</motyw>Jeśli oczu hamować swoich nie umiały/
-Leśnych krynic boginie, aby nie płakały,/
-Gdy baczyły<pe><slowo_obce>baczyły</slowo_obce> --- tu: zobaczyły, patrzyły na.</pe> przy studni Narcyza pięknego,/
-A on umarł prze miłość oblicza swojego;/
-Jeśli nieśmiertelnym stanom żałość rozkazuje,/
-Gdy niebaczna fortuna co niesłusznie psuje:</strofa>
-
-<strofa>Jakoż ja mam hamować, by na lice moje/
-Z oczu smutnych żałośne nie płynęły zdroje?/
-Jako serce powściągać, aby nie wzdychało/
-I od ciężkiej żałości omdlewać nie miało?<end id="e1230084410751"/></strofa>
-
-</liryka_l>
-</utwor>
diff --git a/apps/search/tests/files/fraszki.xml b/apps/search/tests/files/fraszki.xml
deleted file mode 100755
index edb29ab..0000000
--- a/apps/search/tests/files/fraszki.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="">
-<dc:creator xml:lang="pl">Kochanowski, Jan</dc:creator>
-<dc:title xml:lang="pl">Fraszki</dc:title>
-<dc:relation.hasPart xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:relation.hasPart>
-
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Renesans</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre>
-
-<dc:description xml:lang="pl"></dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/lektura/fraszki</dc:identifier.url>
-<dc:source xml:lang="pl"></dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Jan Kochanowski zm. 1584</dc:rights>
-<dc:date.pd xml:lang="pl">1584</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2008-11-12</dc:date>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-</utwor>
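
Note that deleting these two fixtures does not orphan the tests: as the tests diff below shows, fraszka-do-anusie.xml is now loaded from the catalogue app's fixtures instead (fraszki.xml has no replacement in the visible hunks):

    # From the updated setUp() below; get_fixture() resolves a path inside the given app.
    self.do_anusie = Book.from_xml_file(
        get_fixture('fraszka-do-anusie.xml', catalogue))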
diff --git a/apps/search/tests/index.py b/apps/search/tests/index.py
index 5155a84..12f6f7d 100644
--- a/apps/search/tests/index.py
+++ b/apps/search/tests/index.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 from django.conf import settings
 from django.test.utils import override_settings
-from catalogue.test_utils import WLTestCase
-from lucene import PolishAnalyzer, Version
+from catalogue.test_utils import WLTestCase, get_fixture
 from os import path
 import tempfile
-from catalogue import models
-from search import Search, SearchResult
+from catalogue.models import Book, Tag
+from search import Index, Search, SearchResult
+import catalogue
+import opds
 
 
 @override_settings(
@@ -16,23 +17,30 @@ class BookSearchTests(WLTestCase):
     def setUp(self):
         WLTestCase.setUp(self)
 
-        txt = path.join(path.dirname(__file__), 'files/fraszka-do-anusie.xml')
+        index = Index()
+        index.index.delete_all()
+        index.index.commit()
+
         with self.settings(NO_SEARCH_INDEX=False):
-            self.book = models.Book.from_xml_file(txt)
+            self.do_doktora = Book.from_xml_file(
+                get_fixture('do-doktora.xml', opds))
+            self.do_anusie = Book.from_xml_file(
+                get_fixture('fraszka-do-anusie.xml', catalogue))
+
         self.search = Search()
 
     def test_search_perfect_book_author(self):
-        books = self.search.search_perfect_book("sęp szarzyński")
+        books = self.search.search_books(self.search.index.query(authors=u"sęp szarzyński"))
         assert len(books) == 1
-        assert books[0].book_id == self.book.id
+        assert books[0].book_id == self.do_anusie.id
 
     def test_search_perfect_book_title(self):
-        books = self.search.search_perfect_book("fraszka anusie")
+        books = self.search.search_books(self.search.index.query(u"fraszka anusie"))
         assert len(books) == 1
-        assert books[0].book_id == self.book.id
+        assert books[0].book_id == self.do_anusie.id
 
     def test_search_perfect_parts(self):
-        books = self.search.search_perfect_parts("Jakoż hamować")
+        books = self.search.search_phrase(u"Jakoż hamować")
         assert len(books) == 2
         for b in books:
-            b.book_id == self.book.id
+            assert b.book_id == self.do_anusie.id
@@ -41,28 +49,7 @@ class BookSearchTests(WLTestCase):
         assert len(a[0].hits) == 1
 
     def test_search_perfect_author_title(self):
-        books = self.search.search_perfect_book("szarzyński anusie")
+        books = self.search.search_books(self.search.index.query(authors=u"szarzyński anusie"))
         assert books == []
 
-        books = self.search.search_book("szarzyński anusie")
-        assert len(books) == 1
-
-        books = self.search.search_book("szarzyński fraszka")
-        assert len(books) == 1
-
-    def test_search_everywhere(self):
-        books = self.search.search_everywhere("szarzyński kochanek")
-        print 'szarzyński kochanek %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("szarzyński narcyz")
-        print 'szarzyński narcyz %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("anusie narcyz")
-        print 'anusie narcyz %s' % [b.hits for b in books]
-
-        # theme content cross
-        books = self.search.search_everywhere("wzrok  boginie")
-        print 'wzrok boginie %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("anusie płynęły zdroje")
-        print 'anusie płynęły zdroje %s' % [b.hits for b in books]
+
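
For reviewers tracking the migration, the edits above map the old Lucene-backed test calls onto their Solr-backed replacements:

    # Old: self.search.search_perfect_book("sęp szarzyński")
    books = self.search.search_books(self.search.index.query(authors=u"sęp szarzyński"))
    # Old: self.search.search_perfect_parts("Jakoż hamować")
    books = self.search.search_phrase(u"Jakoż hamować")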
diff --git a/apps/search/views.py b/apps/search/views.py
index 72852d0..36dd52c 100644
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -188,14 +188,15 @@ def main(request):
     # ensure books do exists & sort them
     results.sort(reverse=True)
 
-    if len(results) == 1:
-        fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-        if len(fragment_hits) == 1:
-            #anchor = fragment_hits[0]['fragment']
-            #frag = Fragment.objects.get(anchor=anchor)
-            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-        return HttpResponseRedirect(results[0].book.get_absolute_url())
-    elif len(results) == 0:
+    # Don't redirect straight to the book text; show the results page even when there is only one hit.
+    # if len(results) == 1:
+    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+    #     if len(fragment_hits) == 1:
+    #         #anchor = fragment_hits[0]['fragment']
+    #         #frag = Fragment.objects.get(anchor=anchor)
+    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
+    if len(results) == 0:
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response('catalogue/search_no_hits.html',
                                   {'tags': tags,