search changes

author     Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Wed, 24 Oct 2012 14:49:54 +0000 (16:49 +0200)
committer  Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl>
           Wed, 24 Oct 2012 14:49:54 +0000 (16:49 +0200)
apps/search/custom.py
apps/search/index.py
apps/search/management/commands/checkindex.py [deleted file]
apps/search/tests/files/fraszka-do-anusie.xml [deleted file]
apps/search/tests/files/fraszki.xml [deleted file]
apps/search/tests/index.py
apps/search/views.py

diff --git a/apps/search/custom.py b/apps/search/custom.py
index 86d387e..e6f559b 100644
--- a/apps/search/custom.py
+++ b/apps/search/custom.py
@@ -6,6 +6,8 @@ import warnings
 from sunburnt import search
 import copy
 from httplib2 import socket
+import re
+
 
 class TermVectorOptions(search.Options):
     def __init__(self, schema, original=None):
@@ -93,7 +95,6 @@ class CustomSolrInterface(sunburnt.SolrInterface):
             self.init_schema()
         except socket.error, e:
             raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
-            
 
     def _analyze(self, **kwargs):
         if not self.readable:
@@ -134,6 +135,25 @@ class CustomSolrInterface(sunburnt.SolrInterface):
         terms = map(lambda n: unicode(n.text), terms)
         return terms
 
+    def expand_margins(self, text, start, end):
+        totlen = len(text)
+        ws = re.compile(r"\W", re.UNICODE)
+
+        def is_boundary(x):
+            return bool(ws.match(x))
+
+        while start > 0:
+            if is_boundary(text[start - 1]):
+                break
+            start -= 1
+
+        while end < totlen:
+            if is_boundary(text[end]):  # text[end + 1] would skip a char and overrun at the text's end
+                break
+            end += 1
+
+        return (start, end)
+
     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
         start = None
         end = None
@@ -142,23 +162,28 @@ class CustomSolrInterface(sunburnt.SolrInterface):
                               ((s, e),
                                (max(0, s - margins), min(totlen, e + margins))),
                                   matches)
+        matches_margins = map(lambda (m, (s, e)):
+                              (m, self.expand_margins(text, s, e)),
+                              matches_margins)
+
+        # let's start with the first match
         (start, end) = matches_margins[0][1]
-        matches = []
+        matches = [matches_margins[0][0]]
+
         for (m, (s, e)) in matches_margins[1:]:
             if end < s or start > e:
                 continue
             start = min(start, s)
             end = max(end, e)
             matches.append(m)
-            
+
         snip = text[start:end]
         matches.sort(lambda a, b: cmp(b[0], a[0]))
+        # sorted descending by start, so inserting marks doesn't shift earlier offsets
 
         for (s, e) in matches:
             off = - start
             snip = snip[:e + off] + mark[1] + snip[e + off:]
             snip = snip[:s + off] + mark[0] + snip[s + off:]
-            # maybe break on word boundaries
 
         return snip
-
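
For context: the reworked substring() snaps each margin window to word boundaries with the new expand_margins() helper, merges windows that overlap, and inserts the marks back-to-front so earlier offsets stay valid. A minimal usage sketch (hypothetical offsets; solr stands in for a connected CustomSolrInterface):

    # Hypothetical example; the offsets below point at "hamować" and "swoich".
    text = u"Jeśli oczu hamować swoich nie umiały"
    matches = [(11, 18), (19, 25)]  # (start, end) pairs, end exclusive
    snip = solr.substring(text, matches, margins=5, mark=("<b>", "</b>"))
    # The two margin windows overlap, so one merged snippet comes back:
    # u"oczu <b>hamować</b> <b>swoich</b> nie umiały"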
diff --git a/apps/search/index.py b/apps/search/index.py
index 26da062..ea1a6c5 100644
--- a/apps/search/index.py
+++ b/apps/search/index.py
@@ -548,7 +548,7 @@ class SearchResult(object):
             header_span = header_span is not None and int(header_span) or 1
             fragment = doc.get("fragment_anchor", None)
             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-            snippets_rev = doc['snippets_revision']
+            snippets_rev = doc.get('snippets_revision', None)
 
             hit = (sec + (header_span,), fragment, self._score, {
                 'how_found': how_found,
@@ -561,7 +561,7 @@ class SearchResult(object):
             self._hits.append(hit)
 
     def __unicode__(self):
-        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
+        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
 
     def __str__(self):
@@ -725,68 +725,6 @@ class Search(SolrIndex):
     def __init__(self, default_field="text"):
         super(Search, self).__init__(mode='r')
 
-    # def get_tokens(self, searched, field='text', cached=None):
-    #     """returns tokens analyzed by a proper (for a field) analyzer
-    #     argument can be: StringReader, string/unicode, or tokens. In the last case
-    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
-    #     """
-    #     if cached is not None and field in cached:
-    #         return cached[field]
-
-    #     if isinstance(searched, str) or isinstance(searched, unicode):
-    #         searched = StringReader(searched)
-    #     elif isinstance(searched, list):
-    #         return searched
-
-    #     searched.reset()
-    #     tokens = self.analyzer.reusableTokenStream(field, searched)
-    #     toks = []
-    #     while tokens.incrementToken():
-    #         cta = tokens.getAttribute(CharTermAttribute.class_)
-    #         toks.append(cta.toString())
-
-    #     if cached is not None:
-    #         cached[field] = toks
-
-    #     return toks
-
-    # @staticmethod
-    # def fuzziness(fuzzy):
-    #     """Helper method to sanitize fuzziness"""
-    #     if not fuzzy:
-    #         return None
-    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
-    #         return fuzzy
-    #     else:
-    #         return 0.5
-
-    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
-    #     """
-    #     Return a PhraseQuery with a series of tokens.
-    #     """
-    #     if fuzzy:
-    #         phrase = MultiPhraseQuery()
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
-    #             fuzzterms = []
-
-    #             while True:
-    #                 ft = fuzzterm.term()
-    #                 if ft:
-    #                     fuzzterms.append(ft)
-    #                 if not fuzzterm.next(): break
-    #             if fuzzterms:
-    #                 phrase.add(JArray('object')(fuzzterms, Term))
-    #             else:
-    #                 phrase.add(term)
-    #     else:
-    #         phrase = PhraseQuery()
-    #         phrase.setSlop(slop)
-    #         for t in tokens:
-    #             term = Term(field, t)
-    #             phrase.add(term)
-    #     return phrase
 
     def make_term_query(self, query, field='text', modal=operator.or_):
         """
@@ -828,78 +766,6 @@ class Search(SolrIndex):
         res = query.execute()
         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
 
-    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for perfect book matches. Just see if the query matches with some author or title,
-    #     taking hints into account.
-    #     """
-    #     fields_to_search = ['authors', 'title']
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
-
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #             max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
-    #     return books
-
-    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     fields_to_search = ['tags', 'authors', 'title']
-
-    #     only_in = None
-    #     if hint:
-    #         if not hint.should_search_for_book():
-    #             return []
-    #         fields_to_search = hint.just_search_in(fields_to_search)
-    #         only_in = hint.book_filter()
-
-    #     tokens = self.get_tokens(searched, field='SIMPLE')
-
-    #     q = BooleanQuery()
-
-    #     for fld in fields_to_search:
-    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
-    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
-
-    #     books = []
-    #     top = self.searcher.search(q,
-    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
-    #         max_results)
-    #     for found in top.scoreDocs:
-    #         books.append(SearchResult(self, found, how_found="search_book"))
-
-    #     return books
-
-    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
-    #     """
-    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
-    #     some part/fragment of the book.
-    #     """
-    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
-
-    #     flt = None
-    #     if hint:
-    #         flt = hint.part_filter()
-
-    #     books = []
-    #     for q in qrys:
-    #         top = self.searcher.search(q,
-    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
-    #                                                        flt]),
-    #                                    max_results)
-    #         for found in top.scoreDocs:
-    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
-
-    #     return books
 
     def search_everywhere(self, searched, query_terms=None):
         """
@@ -1055,33 +921,6 @@ class Search(SolrIndex):
             except catalogue.models.Book.DoesNotExist: pass
         return bks
  
-    # def make_prefix_phrase(self, toks, field):
-    #     q = MultiPhraseQuery()
-    #     for i in range(len(toks)):
-    #         t = Term(field, toks[i])
-    #         if i == len(toks) - 1:
-    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
-    #             if pterms:
-    #                 q.add(pterms)
-    #             else:
-    #                 q.add(t)
-    #         else:
-    #             q.add(t)
-    #     return q
-
-    # @staticmethod
-    # def term_filter(term, inverse=False):
-    #     only_term = TermsFilter()
-    #     only_term.addTerm(term)
-
-    #     if inverse:
-    #         neg = BooleanFilter()
-    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
-    #         only_term = neg
-
-    #     return only_term
-
-
 
     @staticmethod
     def apply_filters(query, filters):
@@ -1093,15 +932,3 @@ class Search(SolrIndex):
         for f in filters:
             query = query.query(f)
         return query
-
-    # def filtered_categories(self, tags):
-    #     """
-    #     Return a list of tag categories, present in tags list.
-    #     """
-    #     cats = {}
-    #     for t in tags:
-    #         cats[t.category] = True
-    #     return cats.keys()
-
-    # def hint(self):
-    #     return Hint(self)
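
The blocks removed above were Lucene-era helpers that were already commented out. The live Solr path now goes through index.query(), search_books() and apply_filters(). A hedged sketch of how they compose (search_books() and index.query() appear in the tests below; the Q filter object is an assumption based on sunburnt's usual API):

    # Sketch under the assumptions above, not code from this module.
    search = Search()
    query = search.index.query(authors=u"sęp szarzyński")
    query = Search.apply_filters(query, [search.index.Q(is_book=True)])  # Q assumed
    books = search.search_books(query)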
diff --git a/apps/search/management/commands/checkindex.py b/apps/search/management/commands/checkindex.py
deleted file mode 100644
index b910277..0000000
--- a/apps/search/management/commands/checkindex.py
+++ /dev/null
@@ -1,22 +0,0 @@
-
-from django.core.management.base import BaseCommand
-from search import IndexChecker
-
-class Command(BaseCommand):
-    help = 'Check Lucene search index'
-    args = ''
-
-    def handle(self, *args, **opts):
-        checker = IndexChecker()
-        status = checker.check()
-        if status.clean:
-            print "No problems found."
-        else:
-            if status.missingSegments:
-                print "Unable to locate."
-            print "Number of bad segments: %d / %d (max segment name is %d)" % \
-                (status.numBadSegments, status.numSegments, status.maxSegmentName)
-            print "Total lost documents (due to bad segments) %d" % status.totLoseDocCount
-            if not status.validCounter:
-                print "Segment counter is not valid."
-        
diff --git a/apps/search/tests/files/fraszka-do-anusie.xml b/apps/search/tests/files/fraszka-do-anusie.xml
deleted file mode 100755
index 3bbda15..0000000
--- a/apps/search/tests/files/fraszka-do-anusie.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
-  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:S%C4%99p-Szarzy%C5%84ski/Rytmy/Fraszka_do_Anusie">
-<dc:creator xml:lang="pl">Sęp Szarzyński, Mikołaj</dc:creator>
-<dc:title xml:lang="pl">Fraszka do Anusie</dc:title>
-<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor>
-<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor>
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Barok</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre>
-<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:identifier.url>
-<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/8759</dc:source.URL>
-<dc:source xml:lang="pl">Szarzyński Sęp, Mikołaj (ca 1550-1581), Rytmy abo Wiersze polskie w wyborze, E. Wende, Warszawa, 1914</dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Mikołaj Sęp Szarzyński zm. 1581</dc:rights>
-<dc:date.pd xml:lang="pl">1581</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2008-12-29</dc:date>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:audience xml:lang="pl">L</dc:audience>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-  <liryka_l>
-
-<autor_utworu>Mikołaj Sęp Szarzyński</autor_utworu>
-
-<nazwa_utworu>Fraszka do Anusie</nazwa_utworu>
-
-
-
-<strofa><begin id="b1230084410751"/><motyw id="m1230084410751">Kochanek, Łzy, Miłość, Oko, Serce, Wzrok</motyw>Jeśli oczu hamować swoich nie umiały/
-Leśnych krynic boginie, aby nie płakały,/
-Gdy baczyły<pe><slowo_obce>baczyły</slowo_obce> --- tu: zobaczyły, patrzyły na.</pe> przy studni Narcyza pięknego,/
-A on umarł prze miłość oblicza swojego;/
-Jeśli nieśmiertelnym stanom żałość rozkazuje,/
-Gdy niebaczna fortuna co niesłusznie psuje:</strofa>
-
-<strofa>Jakoż ja mam hamować, by na lice moje/
-Z oczu smutnych żałośne nie płynęły zdroje?/
-Jako serce powściągać, aby nie wzdychało/
-I od ciężkiej żałości omdlewać nie miało?<end id="e1230084410751"/></strofa>
-
-</liryka_l>
-</utwor>
diff --git a/apps/search/tests/files/fraszki.xml b/apps/search/tests/files/fraszki.xml
deleted file mode 100755
index edb29ab..0000000
--- a/apps/search/tests/files/fraszki.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version='1.0' encoding='utf-8'?>
-<utwor>
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<rdf:Description rdf:about="">
-<dc:creator xml:lang="pl">Kochanowski, Jan</dc:creator>
-<dc:title xml:lang="pl">Fraszki</dc:title>
-<dc:relation.hasPart xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:relation.hasPart>
-
-<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher>
-<dc:subject.period xml:lang="pl">Renesans</dc:subject.period>
-<dc:subject.type xml:lang="pl">Liryka</dc:subject.type>
-<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre>
-
-<dc:description xml:lang="pl"></dc:description>
-<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/lektura/fraszki</dc:identifier.url>
-<dc:source xml:lang="pl"></dc:source>
-<dc:rights xml:lang="pl">Domena publiczna - Jan Kochanowski zm. 1584</dc:rights>
-<dc:date.pd xml:lang="pl">1584</dc:date.pd>
-<dc:format xml:lang="pl">xml</dc:format>
-<dc:type xml:lang="pl">text</dc:type>
-
-<dc:type xml:lang="en">text</dc:type>
-<dc:date xml:lang="pl">2008-11-12</dc:date>
-<dc:language xml:lang="pl">pol</dc:language>
-</rdf:Description>
-</rdf:RDF>
-</utwor>
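
Note that deleting these two fixtures does not orphan the tests: as the tests diff below shows, fraszka-do-anusie.xml is now loaded from the catalogue app's fixtures instead (fraszki.xml has no replacement in the visible hunks):

    # From the updated setUp() below; get_fixture() resolves a path inside the given app.
    self.do_anusie = Book.from_xml_file(
        get_fixture('fraszka-do-anusie.xml', catalogue))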
diff --git a/apps/search/tests/index.py b/apps/search/tests/index.py
index 5155a84..12f6f7d 100644
--- a/apps/search/tests/index.py
+++ b/apps/search/tests/index.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 from django.conf import settings
 from django.test.utils import override_settings
-from catalogue.test_utils import WLTestCase
-from lucene import PolishAnalyzer, Version
+from catalogue.test_utils import WLTestCase, get_fixture
 from os import path
 import tempfile
-from catalogue import models
-from search import Search, SearchResult
+from catalogue.models import Book, Tag
+from search import Index, Search, SearchResult
+import catalogue
+import opds
 
 
 @override_settings(
@@ -16,23 +17,30 @@ class BookSearchTests(WLTestCase):
     def setUp(self):
         WLTestCase.setUp(self)
 
-        txt = path.join(path.dirname(__file__), 'files/fraszka-do-anusie.xml')
+        index = Index()
+        index.index.delete_all()
+        index.index.commit()
+
         with self.settings(NO_SEARCH_INDEX=False):
-            self.book = models.Book.from_xml_file(txt)
+            self.do_doktora = Book.from_xml_file(
+                get_fixture('do-doktora.xml', opds))
+            self.do_anusie = Book.from_xml_file(
+                get_fixture('fraszka-do-anusie.xml', catalogue))
+
         self.search = Search()
 
     def test_search_perfect_book_author(self):
-        books = self.search.search_perfect_book("sęp szarzyński")
+        books = self.search.search_books(self.search.index.query(authors=u"sęp szarzyński"))
         assert len(books) == 1
-        assert books[0].book_id == self.book.id
+        assert books[0].book_id == self.do_anusie.id
 
     def test_search_perfect_book_title(self):
-        books = self.search.search_perfect_book("fraszka anusie")
+        books = self.search.search_books(self.search.index.query(u"fraszka anusie"))
         assert len(books) == 1
-        assert books[0].book_id == self.book.id
+        assert books[0].book_id == self.do_anusie.id
 
     def test_search_perfect_parts(self):
-        books = self.search.search_perfect_parts("Jakoż hamować")
+        books = self.search.search_phrase(u"Jakoż hamować")
         assert len(books) == 2
         for b in books:
-            b.book_id == self.book.id
+            assert b.book_id == self.do_anusie.id
@@ -41,28 +49,7 @@ class BookSearchTests(WLTestCase):
         assert len(a[0].hits) == 1
 
     def test_search_perfect_author_title(self):
-        books = self.search.search_perfect_book("szarzyński anusie")
+        books = self.search.search_books(self.search.index.query(authors=u"szarzyński anusie"))
         assert books == []
 
-        books = self.search.search_book("szarzyński anusie")
-        assert len(books) == 1
-
-        books = self.search.search_book("szarzyński fraszka")
-        assert len(books) == 1
-
-    def test_search_everywhere(self):
-        books = self.search.search_everywhere("szarzyński kochanek")
-        print 'szarzyński kochanek %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("szarzyński narcyz")
-        print 'szarzyński narcyz %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("anusie narcyz")
-        print 'anusie narcyz %s' % [b.hits for b in books]
-
-        # theme content cross
-        books = self.search.search_everywhere("wzrok  boginie")
-        print 'wzrok boginie %s' % [b.hits for b in books]
-
-        books = self.search.search_everywhere("anusie płynęły zdroje")
-        print 'anusie płynęły zdroje %s' % [b.hits for b in books]
+
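
For reviewers tracking the migration, the edits above map the old Lucene-backed test calls onto their Solr-backed replacements:

    # Old: self.search.search_perfect_book("sęp szarzyński")
    books = self.search.search_books(self.search.index.query(authors=u"sęp szarzyński"))
    # Old: self.search.search_perfect_parts("Jakoż hamować")
    books = self.search.search_phrase(u"Jakoż hamować")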
diff --git a/apps/search/views.py b/apps/search/views.py
index 72852d0..36dd52c 100644
--- a/apps/search/views.py
+++ b/apps/search/views.py
@@ -188,14 +188,15 @@ def main(request):
     # ensure books do exists & sort them
     results.sort(reverse=True)
 
-    if len(results) == 1:
-        fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
-        if len(fragment_hits) == 1:
-            #anchor = fragment_hits[0]['fragment']
-            #frag = Fragment.objects.get(anchor=anchor)
-            return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
-        return HttpResponseRedirect(results[0].book.get_absolute_url())
-    elif len(results) == 0:
+    # Don't redirect straight to the book text; show the results page even when there is only one hit.
+    # if len(results) == 1:
+    #     fragment_hits = filter(lambda h: 'fragment' in h, results[0].hits)
+    #     if len(fragment_hits) == 1:
+    #         #anchor = fragment_hits[0]['fragment']
+    #         #frag = Fragment.objects.get(anchor=anchor)
+    #         return HttpResponseRedirect(fragment_hits[0]['fragment'].get_absolute_url())
+    #     return HttpResponseRedirect(results[0].book.get_absolute_url())
+    if len(results) == 0:
         form = PublishingSuggestForm(initial={"books": query + ", "})
         return render_to_response('catalogue/search_no_hits.html',
                                   {'tags': tags,