Remove legacy search.

author Radek Czajka <rczajka@rczajka.pl>
Thu, 15 Jun 2023 10:48:46 +0000 (12:48 +0200)
committer Radek Czajka <rczajka@rczajka.pl>
Thu, 15 Jun 2023 10:48:46 +0000 (12:48 +0200)
diff --git a/.gitignore b/.gitignore

index 99c0898..3d746f9 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ coverage.xml
  pip-log.txt
  nosetests.xml
  /htmlcov
  pip-log.txt
  nosetests.xml
  /htmlcov
+.python-version
  
  # Mac OS X garbage
  .DS_Store
  
  # Mac OS X garbage
  .DS_Store
diff --git a/requirements/requirements.txt b/requirements/requirements.txt

index 99e11fa..86afdfc 100644 (file)
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -49,9 +49,6 @@ celery[redis]==5.2.7
  #pyoai==2.5.1
  -e git+https://github.com/infrae/pyoai@5ff2f15e869869e70d8139e4c37b7832854d7049#egg=pyoai
  
  #pyoai==2.5.1
  -e git+https://github.com/infrae/pyoai@5ff2f15e869869e70d8139e4c37b7832854d7049#egg=pyoai
  
-scorched==0.13
-httplib2
-
  sentry-sdk==0.10.2
  
  requests
  sentry-sdk==0.10.2
  
  requests
diff --git a/src/api/tests/res/responses/collection.json b/src/api/tests/res/responses/collection.json

index 2992461..0e385dc 100644 (file)
--- a/src/api/tests/res/responses/collection.json
+++ b/src/api/tests/res/responses/collection.json
@@ -1,5 +1,6 @@
  {
      "url": "http://testserver/katalog/lektury/a-collection/",
  {
      "url": "http://testserver/katalog/lektury/a-collection/",
+    "authors": [],
      "books": [
          {
              "kind": "Liryka", 
      "books": [
          {
              "kind": "Liryka", 
diff --git a/src/catalogue/management/commands/importbooks.py b/src/catalogue/management/commands/importbooks.py

index e9b3364..9322eea 100644 (file)
--- a/src/catalogue/management/commands/importbooks.py
+++ b/src/catalogue/management/commands/importbooks.py
@@ -12,7 +12,6 @@ from librarian.picture import ImageStore
  
  from catalogue.models import Book
  from picture.models import Picture
  
  from catalogue.models import Book
  from picture.models import Picture
-from search.index import Index
  
  
  class Command(BaseCommand):
  
  
  class Command(BaseCommand):
@@ -28,10 +27,6 @@ class Command(BaseCommand):
          parser.add_argument(
                  '-D', '--dont-build', dest='dont_build', metavar="FORMAT,...",
                  help="Skip building specified formats")
          parser.add_argument(
                  '-D', '--dont-build', dest='dont_build', metavar="FORMAT,...",
                  help="Skip building specified formats")
-        parser.add_argument(
-                '-S', '--no-search-index', action='store_false',
-                dest='search_index', default=True,
-                help='Skip indexing imported works for search')
          parser.add_argument(
                  '-F', '--not-findable', action='store_false',
                  dest='findable', default=True,
          parser.add_argument(
                  '-F', '--not-findable', action='store_false',
                  dest='findable', default=True,
@@ -50,7 +45,6 @@ class Command(BaseCommand):
          file_base, ext = os.path.splitext(file_path)
          book = Book.from_xml_file(file_path, overwrite=options.get('force'),
                                    dont_build=dont_build,
          file_base, ext = os.path.splitext(file_path)
          book = Book.from_xml_file(file_path, overwrite=options.get('force'),
                                    dont_build=dont_build,
-                                  search_index_tags=False,
                                    findable=options.get('findable'),
                                    remote_gallery_url='file://' + os.path.dirname(os.path.abspath(file_base)) + '/img/'
                                    )
                                    findable=options.get('findable'),
                                    remote_gallery_url='file://' + os.path.dirname(os.path.abspath(file_base)) + '/img/'
                                    )
@@ -84,15 +78,6 @@ class Command(BaseCommand):
          verbose = options.get('verbose')
          import_picture = options.get('import_picture')
  
          verbose = options.get('verbose')
          import_picture = options.get('import_picture')
  
-        if options.get('search_index') and not settings.NO_SEARCH_INDEX:
-            index = Index()
-            try:
-                index.index_tags()
-                index.index.commit()
-            except Exception as e:
-                index.index.rollback()
-                raise e
-
          files_imported = 0
          files_skipped = 0
  
          files_imported = 0
          files_skipped = 0
  
diff --git a/src/catalogue/models/book.py b/src/catalogue/models/book.py

index 85cfd63..bcbefea 100644 (file)
--- a/src/catalogue/models/book.py
+++ b/src/catalogue/models/book.py
@@ -529,21 +529,11 @@ class Book(models.Model):
          })
          return create_zip(paths, "%s_%s" % (self.slug, format_), {'informacje.txt': readme})
  
          })
          return create_zip(paths, "%s_%s" % (self.slug, format_), {'informacje.txt': readme})
  
-    def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
+    def search_index(self, index=None):
          if not self.findable:
              return
          if not self.findable:
              return
-        if index is None:
-            from search.index import Index
-            index = Index()
-        try:
-            index.index_book(self, book_info)
-            if index_tags:
-                index.index_tags()
-            if commit:
-                index.index.commit()
-        except Exception as e:
-            index.index.rollback()
-            raise e
+        from search.index import Index
+        Index.index_book(self)
  
      # will make problems in conjunction with paid previews
      def download_pictures(self, remote_gallery_url):
  
      # will make problems in conjunction with paid previews
      def download_pictures(self, remote_gallery_url):
@@ -603,7 +593,7 @@ class Book(models.Model):
  
      @classmethod
      def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
  
      @classmethod
      def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
-                           search_index_tags=True, remote_gallery_url=None, days=0, findable=True):
+                           remote_gallery_url=None, days=0, findable=True):
          from catalogue import tasks
  
          if dont_build is None:
          from catalogue import tasks
  
          if dont_build is None:
@@ -712,7 +702,7 @@ class Book(models.Model):
                  getattr(book, '%s_file' % format_).build_delay()
  
          if not settings.NO_SEARCH_INDEX and search_index and findable:
                  getattr(book, '%s_file' % format_).build_delay()
  
          if not settings.NO_SEARCH_INDEX and search_index and findable:
-            tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags)
+            tasks.index_book.delay(book.id)
  
          for child in notify_cover_changed:
              child.parent_cover_changed()
  
          for child in notify_cover_changed:
              child.parent_cover_changed()
diff --git a/src/catalogue/signals.py b/src/catalogue/signals.py

index 72f8a89..81c0b9c 100644 (file)
--- a/src/catalogue/signals.py
+++ b/src/catalogue/signals.py
@@ -53,13 +53,6 @@ def book_save(sender, instance, **kwargs):
  def book_delete(sender, instance, **kwargs):
      caches[settings.CACHE_MIDDLEWARE_ALIAS].clear()
  
  def book_delete(sender, instance, **kwargs):
      caches[settings.CACHE_MIDDLEWARE_ALIAS].clear()
  
-    if not settings.NO_SEARCH_INDEX:
-        # remove the book from search index, when it is deleted.
-        from search.index import Index
-        idx = Index()
-        idx.remove_book(instance)
-        idx.index_tags()
-
  
  ####
  # Tag
  
  ####
  # Tag
diff --git a/src/catalogue/tasks.py b/src/catalogue/tasks.py

index b2308bb..0694b01 100644 (file)
--- a/src/catalogue/tasks.py
+++ b/src/catalogue/tasks.py
@@ -32,9 +32,9 @@ def build_field(pk, field_name):
  
  
  @shared_task
  
  
  @shared_task
-def index_book(book_id, book_info=None, **kwargs):
+def index_book(book_id, **kwargs):
      try:
      try:
-        return Book.objects.get(id=book_id).search_index(book_info, **kwargs)
+        return Book.objects.get(id=book_id).search_index(**kwargs)
      except Exception as e:
          print("Exception during index: %s" % e)
          print_exc()
      except Exception as e:
          print("Exception during index: %s" % e)
          print_exc()
diff --git a/src/catalogue/templates/catalogue/search_multiple_hits.html b/src/catalogue/templates/catalogue/search_multiple_hits.html

deleted file mode 100644 (file)

index 937b926..0000000
--- a/src/catalogue/templates/catalogue/search_multiple_hits.html
+++ /dev/null
@@ -1,130 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load pagination_tags %}
-{% load inline_tag_list from catalogue_tags %}
-{% load book_searched from search_tags %}
-{% load set_get_parameter %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-
-  <h1>{% trans "Search" %}</h1>
-
-  <div class="white-box">
-
-    <p class="search-filter">
-      <strong>format:</strong>
-      {% if not set.format %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter format='' %}">dowolny</a>{% endif %}
-      {% if set.format == "tekst" %}<em>tekst</em>{% else %}<a href="{% set_get_parameter format='tekst' %}">tekst</a>{% endif %}
-      {% if set.format == "audio" %}<em>audiobook</em>{% else %}<a href="{% set_get_parameter format='audio' %}">audiobook</a>{% endif %}
-      {% if set.format == "synchro" %}<em>DAISY</em>{% else %}<a href="{% set_get_parameter format='synchro' %}">DAISY</a>{% endif %}
-      {% if set.format == "obraz" %}<em>obraz</em>{% else %}<a href="{% set_get_parameter format='obraz' %}">obraz</a>{% endif %}
-    </p>
-
-    <p class="search-filter">
-      <strong>{% trans "language" %}: </strong>
-      {% if not set.lang %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter lang='' %}">dowolny</a>{% endif %}
-      {% if set.lang == "pol" %}<em>polski</em>{% else %}<a href="{% set_get_parameter lang='pol' %}">polski</a>{% endif %}
-      {% if set.lang == "eng" %}<em>angielski</em>{% else %}<a href="{% set_get_parameter lang='eng' %}">angielski</a>{% endif %}
-      {% if set.lang == "fre" %}<em>francuski</em>{% else %}<a href="{% set_get_parameter lang='fre' %}">francuski</a>{% endif %}
-      {% if set.lang == "ger" %}<em>niemiecki</em>{% else %}<a href="{% set_get_parameter lang='ger' %}">niemiecki</a>{% endif %}
-      {% if set.lang == "lit" %}<em>litewski</em>{% else %}<a href="{% set_get_parameter lang='lit' %}">litewski</a>{% endif %}
-    </p>
-
-    </p>
-    <p class="search-filter">
-      <strong>{% trans "epoch" %}: </strong>
-      {% if not set.epoch %}<em>dowolna</em>{% else %}<a href="{% set_get_parameter epoch='' %}">dowolna</a>{% endif %}
-
-      {% for tag in tags.epoch %}
-        {% if set.epoch == tag.slug %}
-          <em>{{ tag.name }}</em>
-        {% else %}
-          <a href="{% set_get_parameter epoch=tag.slug %}">
-            {{ tag.name }}
-          </a>
-        {% endif %}
-      {% endfor %}
-    </p>
-    <p class="search-filter">
-      <strong>{% trans "kind" %}: </strong>
-      {% if not set.kind %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter kind='' %}">dowolny</a>{% endif %}
-      {% for tag in tags.kind %}
-        {% if set.kind == tag.slug %}
-          <em>{{ tag.name }}</em>
-        {% else %}
-          <a href="{% set_get_parameter kind=tag.slug %}">
-            {{ tag.name }}
-          </a>
-        {% endif %}
-      {% endfor %}
-    </p>
-
-    {% comment %}
-    <p class="search-filter">
-      <strong>{% trans "genre" %}: </strong>
-      {% if not set.genre %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter genre='' %}">dowolny</a>{% endif %}
-      {% for tag in tags.genre %}
-          {% if set.genre == tag.slug %}
-            <em>{{ tag.name }}</em>
-          {% else %}
-            <a href="{% set_get_parameter genre=tag.slug %}">
-              {{ tag.name }}
-            </a>
-          {% endif %}
-        {% endfor %}
-    </p>
-    {% endcomment %}
-  </div>
-
-  {% if did_you_mean %}
-    <span class="did_you_mean">{% trans "Did you mean" %}
-      <a href="{% url 'search' %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
-  {% endif %}
-  <div class="top-tag-list">
-    {% if pd_authors %}
-      <div>
-        {% for author in pd_authors %}
-          <div class="tag-box">
-            {% include "pdcounter/pd_author_box.html" %}
-          </div>
-        {% endfor %}
-      </div>
-    {% endif %}
-  </div>
-
-  <div>
-    {% if books %}
-      <ul class="work-list">
-        {% if pictures %}
-          <h1>{% trans "Books" %}</h1>
-        {% endif %}
-        {% for result in books %}
-          <li class="Book-item">
-            <div class="search-result">
-              {% book_searched result %}
-            </div>
-          </li>
-        {% endfor %}
-      </ul>
-    {% endif %}
-
-    {% if pictures %}
-      <h1>{% trans "Art" %}</h1>
-      <ul class="work-list">
-        {% for result in pictures %}
-          <li class="Picture-item">
-            <div class="search-result">
-              {% with result.picture as picture %}
-                {% include "picture/picture_searched.html" %}
-              {% endwith %}
-            </div>
-          </li>
-        {% endfor %}
-      </ul>
-    {% endif %}
-  </div>
-{% endblock %}
diff --git a/src/catalogue/templates/catalogue/search_no_hits.html b/src/catalogue/templates/catalogue/search_no_hits.html

deleted file mode 100644 (file)

index 3f9e982..0000000
--- a/src/catalogue/templates/catalogue/search_no_hits.html
+++ /dev/null
@@ -1,29 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-  <h1>{% trans "Search" %}</h1>
-
-  <div class="left-column">
-    <div class="normal-text">
-      <p>
-        {% if did_you_mean %}
-          <span class="did_you_mean">{% trans "Did you mean" %}
-            <a href="{% url 'search' %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
-        {% endif %}
-      </p>
-      <p>{% trans "Sorry! Search cirteria did not match any resources." %}</p>
-
-      {% include "info/join_us.html" %}
-    </div>
-  </div>
-
-  <div class="right-column">
-    {% include "publishing_suggest.html" %}
-  </div>
-{% endblock %}
diff --git a/src/catalogue/templates/catalogue/search_too_long.html b/src/catalogue/templates/catalogue/search_too_long.html

deleted file mode 100644 (file)

index 4f780df..0000000
--- a/src/catalogue/templates/catalogue/search_too_long.html
+++ /dev/null
@@ -1,16 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-  <h1>{% trans "Search" %}</h1>
-
-  <div id="books-list">
-    <p>{% trans "Sorry! Search query is too long to be processed." %}</p>
-    {% include "info/join_us.html" %}
-  </div>
-{% endblock %}
-\ No newline at end of file
diff --git a/src/catalogue/templates/catalogue/search_too_short.html b/src/catalogue/templates/catalogue/search_too_short.html

deleted file mode 100644 (file)

index 253a94b..0000000
--- a/src/catalogue/templates/catalogue/search_too_short.html
+++ /dev/null
@@ -1,16 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-  <h1>{% trans "Search" %}</h1>
-
-  <div id="books-list">
-    <p>{% trans "Sorry! Search query must have at least two characters." %}</p>
-    {% include "info/join_us.html" %}
-  </div>
-{% endblock %}
-\ No newline at end of file
diff --git a/src/catalogue/test_utils.py b/src/catalogue/test_utils.py

index 6bc5569..c15cba7 100644 (file)
--- a/src/catalogue/test_utils.py
+++ b/src/catalogue/test_utils.py
@@ -19,7 +19,6 @@ from librarian import WLURI
      CACHES={
          'default': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
      },
      CACHES={
          'default': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
      },
-    SOLR=settings.SOLR_TEST,
  )
  class WLTestCase(TestCase):
      """
  )
  class WLTestCase(TestCase):
      """
@@ -74,7 +73,7 @@ def info_args(title, language=None):
          language = 'pol'
      return {
          'title': str(title),
          language = 'pol'
      return {
          'title': str(title),
-        'url': WLURI.from_slug(slug),
+        'url': WLURI(slug),
          'about': "http://wolnelektury.pl/example/URI/%s" % slug,
          'language': language,
      }
          'about': "http://wolnelektury.pl/example/URI/%s" % slug,
          'language': language,
      }
diff --git a/src/catalogue/tests/test_book_import.py b/src/catalogue/tests/test_book_import.py

index f8900c8..5f9627f 100644 (file)
--- a/src/catalogue/tests/test_book_import.py
+++ b/src/catalogue/tests/test_book_import.py
@@ -14,7 +14,7 @@ class BookImportLogicTests(WLTestCase):
      def setUp(self):
          WLTestCase.setUp(self)
          self.book_info = BookInfoStub(
      def setUp(self):
          WLTestCase.setUp(self)
          self.book_info = BookInfoStub(
-            url=WLURI.from_slug("default-book"),
+            url=WLURI("default-book"),
              about="http://wolnelektury.pl/example/URI/default_book",
              title="Default Book",
              author=PersonStub(("Jim",), "Lazy"),
              about="http://wolnelektury.pl/example/URI/default_book",
              title="Default Book",
              author=PersonStub(("Jim",), "Lazy"),
@@ -121,7 +121,7 @@ class BookImportLogicTests(WLTestCase):
  
      def test_book_with_invalid_slug(self):
          """ Book with invalid characters in slug shouldn't be imported """
  
      def test_book_with_invalid_slug(self):
          """ Book with invalid characters in slug shouldn't be imported """
-        self.book_info.url = WLURI.from_slug("default_book")
+        self.book_info.url = WLURI("default_book")
          book_text = "<utwor />"
          with self.assertRaises(ValueError):
              models.Book.from_text_and_meta(ContentFile(book_text), self.book_info)
          book_text = "<utwor />"
          with self.assertRaises(ValueError):
              models.Book.from_text_and_meta(ContentFile(book_text), self.book_info)
@@ -375,7 +375,7 @@ class TreeImportTest(WLTestCase):
  class MultilingualBookImportTest(WLTestCase):
      def setUp(self):
          WLTestCase.setUp(self)
  class MultilingualBookImportTest(WLTestCase):
      def setUp(self):
          WLTestCase.setUp(self)
-        common_uri = WLURI.from_slug('common-slug')
+        common_uri = WLURI('common-slug')
  
          self.pol_info = BookInfoStub(
              genre='X-Genre',
  
          self.pol_info = BookInfoStub(
              genre='X-Genre',
diff --git a/src/catalogue/tests/test_bookmedia.py b/src/catalogue/tests/test_bookmedia.py

index edd17a5..7952826 100644 (file)
--- a/src/catalogue/tests/test_bookmedia.py
+++ b/src/catalogue/tests/test_bookmedia.py
@@ -4,7 +4,7 @@
  from os.path import basename, exists
  from unittest import skip
  
  from os.path import basename, exists
  from unittest import skip
  
-from django.core.files.base import ContentFile
+from django.core.files.base import ContentFile, File
  
  from catalogue.test_utils import *
  from catalogue import models, utils
  
  from catalogue.test_utils import *
  from catalogue import models, utils
@@ -17,6 +17,8 @@ class BookMediaTests(WLTestCase):
          self.file = ContentFile(b'X')
          self.file2 = ContentFile(b'Y')
          self.book = models.Book.objects.create(slug='test-book', title='Test')
          self.file = ContentFile(b'X')
          self.file2 = ContentFile(b'Y')
          self.book = models.Book.objects.create(slug='test-book', title='Test')
+        with open(join(dirname(__file__), "files/fraszka-do-anusie.xml")) as f:
+            self.book.xml_file.save(None, File(f))
  
      def set_title(self, title):
          self.book.title = title
  
      def set_title(self, title):
          self.book.title = title
diff --git a/src/catalogue/tests/test_tags.py b/src/catalogue/tests/test_tags.py

index a706618..0853a42 100644 (file)
--- a/src/catalogue/tests/test_tags.py
+++ b/src/catalogue/tests/test_tags.py
@@ -107,73 +107,88 @@ class TagRelatedTagsTests(WLTestCase):
      def test_empty(self):
          """ empty tag should have no related tags """
  
      def test_empty(self):
          """ empty tag should have no related tags """
  
-        cats = self.client.get('/katalog/autor/empty/').context['categories']
-        self.assertEqual({k: v for (k, v) in cats.items() if v}, {}, 'tags related to empty tag')
+        suggested = self.client.get('/katalog/autor/empty/').context['suggested_tags']
+        self.assertEqual(suggested, [], 'tags related to empty tag')
  
      def test_has_related(self):
          """ related own and descendants' tags should be generated """
  
  
      def test_has_related(self):
          """ related own and descendants' tags should be generated """
  
-        cats = self.client.get('/katalog/rodzaj/kind/').context['categories']
-        self.assertTrue('Common Man' in [tag.name for tag in cats['author']],
+        suggested = {
+            (t.name, t.category)
+            for t in self.client.get('/katalog/rodzaj/kind/').context['suggested_tags']
+        }
+        self.assertTrue(('Common Man', 'author') in suggested,
                          'missing `author` related tag')
                          'missing `author` related tag')
-        self.assertTrue('Epoch' in [tag.name for tag in cats['epoch']],
+        self.assertTrue(('Epoch', 'epoch') in suggested,
                          'missing `epoch` related tag')
                          'missing `epoch` related tag')
-        self.assertFalse(cats.get("kind", False),
+        # TODO: this should probably be changed now.
+        self.assertFalse(any(x for x in suggested if x[1] == "kind"),
                           "There should be no child-only related `kind` tags")
                           "There should be no child-only related `kind` tags")
-        self.assertTrue("Genre" in [tag.name for tag in cats['genre']],
+        self.assertTrue(("Genre", 'genre') in suggested,
                          'missing `genre` related tag')
                          'missing `genre` related tag')
-        self.assertFalse("ChildGenre" in [tag.name for tag in cats['genre']],
+        # TODO: this should probably be changed now.
+        self.assertFalse(("ChildGenre", 'genre') in suggested,
                           "There should be no child-only related `genre` tags")
                           "There should be no child-only related `genre` tags")
-        self.assertTrue("GchildGenre" in [tag.name for tag in cats['genre']],
+        self.assertTrue(("GchildGenre", "genre") in suggested,
                          "missing grandchild's related tag")
                          "missing grandchild's related tag")
-        self.assertTrue('Theme' in [tag.name for tag in cats['theme']],
+        self.assertTrue(('Theme', 'theme') in suggested,
                          "missing related theme")
                          "missing related theme")
-        self.assertFalse('Child1Theme' in [tag.name for tag in cats['theme']],
-                         "There should be no child-only related `theme` tags")
-        self.assertTrue('GChildTheme' in [tag.name for tag in cats['theme']],
+        self.assertTrue(('Child1Theme', 'theme') in suggested,
+                         "missing child's related theme")
+        self.assertTrue(('GChildTheme', 'theme') in suggested,
                          "missing grandchild's related theme")
  
      def test_related_differ(self):
          """ related tags shouldn't include filtering tags """
  
          response = self.client.get('/katalog/rodzaj/kind/')
                          "missing grandchild's related theme")
  
      def test_related_differ(self):
          """ related tags shouldn't include filtering tags """
  
          response = self.client.get('/katalog/rodzaj/kind/')
-        cats = response.context['categories']
-        self.assertFalse(cats.get('kind', False),
+        suggested = response.context['suggested_tags']
+        self.assertFalse(any(x for x in suggested if x.category == 'kind'),
                           'filtering tag wrongly included in related')
                           'filtering tag wrongly included in related')
-        cats = self.client.get('/katalog/motyw/theme/').context['categories']
-        self.assertFalse('Theme' in [tag.name for tag in cats['theme']],
+        suggested = {
+            (t.name, t.category)
+            for t in self.client.get(
+                    '/katalog/motyw/theme/').context['suggested_tags']
+        }
+        self.assertFalse(('Theme', 'theme') in suggested,
                           'filtering theme wrongly included in related')
  
      def test_parent_tag_once(self):
          """ if parent and descendants have a common tag, count it only once """
  
                           'filtering theme wrongly included in related')
  
      def test_parent_tag_once(self):
          """ if parent and descendants have a common tag, count it only once """
  
-        cats = self.client.get('/katalog/rodzaj/kind/').context['categories']
-        self.assertEqual([(tag.name, tag.count) for tag in cats['epoch']],
+        suggested = self.client.get('/katalog/rodzaj/kind/').context['suggested_tags']
+        self.assertEqual([(tag.name, tag.count) for tag in suggested if tag.category == 'epoch'],
                           [('Epoch', 1)],
                           'wrong related tag epoch tag on tag page')
  
      def test_siblings_tags_count(self):
          """ if children have tags and parent hasn't, count the children """
  
                           [('Epoch', 1)],
                           'wrong related tag epoch tag on tag page')
  
      def test_siblings_tags_count(self):
          """ if children have tags and parent hasn't, count the children """
  
-        cats = self.client.get('/katalog/epoka/epoch/').context['categories']
+        suggested = self.client.get('/katalog/epoka/epoch/').context['suggested_tags']
+        kinds = [(tag.name, tag.count) for tag in suggested if tag.category == 'kind']
          self.assertTrue(
          self.assertTrue(
-            ('ChildKind', 2) in [(tag.name, tag.count) for tag in cats['kind']],
-            'wrong related kind tags on tag page, got: ' +
-            str([(tag.name, tag.count) for tag in cats['kind']]))
+            ('ChildKind', 2) in kinds,
+            'wrong related kind tags on tag page'
+        )
  
          # all occurencies of theme should be counted
  
          # all occurencies of theme should be counted
-        self.assertTrue(('Theme', 4) in [(tag.name, tag.count) for tag in cats['theme']],
-                        'wrong related theme count')
+        themes = [(tag.name, tag.count) for tag in suggested if tag.category == 'theme']
+        self.assertTrue(
+            ('Theme', 4) in themes,
+            'wrong related theme count'
+        )
  
      def test_query_child_tag(self):
          """
          If child and parent have a common tag, but parent isn't included
          in the result, child should still count.
          """
  
      def test_query_child_tag(self):
          """
          If child and parent have a common tag, but parent isn't included
          in the result, child should still count.
          """
-        cats = self.client.get('/katalog/gatunek/childgenre/').context['categories']
-        self.assertTrue(('Epoch', 2) in [(tag.name, tag.count) for tag in cats['epoch']],
-                        'wrong related kind tags on tag page, got: ' +
-                        str([(tag.name, tag.count) for tag in cats['epoch']]))
+        suggested = self.client.get('/katalog/gatunek/childgenre/').context['suggested_tags']
+        epochs = [(tag.name, tag.count) for tag in suggested if tag.category == 'epoch']
+        self.assertTrue(
+            ('Epoch', 2) in epochs,
+            'wrong related kind tags on tag page'
+        )
  
  
  class CleanTagRelationTests(WLTestCase):
  
  
  class CleanTagRelationTests(WLTestCase):
@@ -198,8 +213,8 @@ class CleanTagRelationTests(WLTestCase):
          """ there should be no related tags left after deleting some objects """
  
          models.Book.objects.all().delete()
          """ there should be no related tags left after deleting some objects """
  
          models.Book.objects.all().delete()
-        cats = self.client.get('/katalog/rodzaj/k/').context['categories']
-        self.assertEqual({k: v for (k, v) in cats.items() if v}, {})
+        suggested = self.client.get('/katalog/rodzaj/k/').context['suggested_tags']
+        self.assertEqual(suggested, [])
          self.assertEqual(models.Fragment.objects.all().count(), 0,
                           "orphaned fragments left")
          self.assertEqual(models.Tag.intermediary_table_model.objects.all().count(), 0,
          self.assertEqual(models.Fragment.objects.all().count(), 0,
                           "orphaned fragments left")
          self.assertEqual(models.Tag.intermediary_table_model.objects.all().count(), 0,
@@ -248,10 +263,11 @@ class TestIdenticalTag(WLTestCase):
                  self.book_info)
          categories = {'author': 'autor', 'theme': 'motyw', 'epoch': 'epoka', 'kind': 'rodzaj', 'genre': 'gatunek'}
          for cat, localcat in categories.items():
                  self.book_info)
          categories = {'author': 'autor', 'theme': 'motyw', 'epoch': 'epoka', 'kind': 'rodzaj', 'genre': 'gatunek'}
          for cat, localcat in categories.items():
+            if cat == 'theme': continue
              context = self.client.get('/katalog/%s/tag/' % localcat).context
              self.assertEqual(1, len(context['object_list']))
              context = self.client.get('/katalog/%s/tag/' % localcat).context
              self.assertEqual(1, len(context['object_list']))
-            self.assertNotEqual({}, context['categories'])
-            self.assertFalse(context['categories'].get(cat, False))
+            self.assertNotEqual([], context['suggested_tags'])
+            self.assertFalse(any(t for t in context['suggested_tags'] if t.category == cat))
  
  
  class BookTagsTests(WLTestCase):
  
  
  class BookTagsTests(WLTestCase):
diff --git a/src/club/forms.py b/src/club/forms.py

index 5c2d301..098a70e 100644 (file)
--- a/src/club/forms.py
+++ b/src/club/forms.py
@@ -138,7 +138,8 @@ class DonationStep1Form(forms.ModelForm):
      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
          club = models.Club.objects.first()
      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
          club = models.Club.objects.first()
-        self.fields['custom_amount'].widget.attrs['min'] = club.min_amount
+        if club is not None:
+            self.fields['custom_amount'].widget.attrs['min'] = club.min_amount
  
      def clean(self):
          state = {}
  
      def clean(self):
          state = {}
diff --git a/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py b/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py

index 0c696b2..c73450c 100644 (file)
--- a/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py
+++ b/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py
@@ -6,8 +6,9 @@ from django.db import migrations, models
  def last_amount_wide(apps, schema_editor):
      SingleAmount = apps.get_model('club', 'SingleAmount')
      a = SingleAmount.objects.last()
  def last_amount_wide(apps, schema_editor):
      SingleAmount = apps.get_model('club', 'SingleAmount')
      a = SingleAmount.objects.last()
-    a.wide = True
-    a.save()
+    if a is not None:
+        a.wide = True
+        a.save()
  
  
  class Migration(migrations.Migration):
  
  
  class Migration(migrations.Migration):
diff --git a/src/opds/tests/test_opds.py b/src/opds/tests/test_opds.py

index 2c37bd4..e86b865 100644 (file)
--- a/src/opds/tests/test_opds.py
+++ b/src/opds/tests/test_opds.py
@@ -1,26 +1,20 @@
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from unittest import skipIf
  from lxml import etree
  from django.conf import settings
  import catalogue
  from catalogue.test_utils import WLTestCase, get_fixture
  from catalogue.models import Book
  from librarian import WLURI, XMLNamespace
  from lxml import etree
  from django.conf import settings
  import catalogue
  from catalogue.test_utils import WLTestCase, get_fixture
  from catalogue.models import Book
  from librarian import WLURI, XMLNamespace
-from search.index import Index
  
  AtomNS = XMLNamespace("http://www.w3.org/2005/Atom")
  
  
  
  AtomNS = XMLNamespace("http://www.w3.org/2005/Atom")
  
  
-@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False), 'Requires search server and NO_SEARCH_INDEX=False.')
  class OpdsSearchTests(WLTestCase):
      """Tests search feed in OPDS.."""
      def setUp(self):
          WLTestCase.setUp(self)
  class OpdsSearchTests(WLTestCase):
      """Tests search feed in OPDS.."""
      def setUp(self):
          WLTestCase.setUp(self)
-        index = Index()
-        index.index.delete_all()
-        index.index.commit()
  
          self.do_doktora = Book.from_xml_file(
              get_fixture('do-doktora.xml'))
  
          self.do_doktora = Book.from_xml_file(
              get_fixture('do-doktora.xml'))
@@ -32,7 +26,7 @@ class OpdsSearchTests(WLTestCase):
          tree = etree.fromstring(
              self.client.get('/opds/search/?%s' % query).content)
          elem_ids = tree.findall('.//%s/%s' % (AtomNS('entry'), AtomNS('id')))
          tree = etree.fromstring(
              self.client.get('/opds/search/?%s' % query).content)
          elem_ids = tree.findall('.//%s/%s' % (AtomNS('entry'), AtomNS('id')))
-        slugs = [WLURI(elem.text).slug for elem in elem_ids]
+        slugs = [WLURI.from_text(elem.text).slug for elem in elem_ids]
          self.assertEqual(set(slugs), set(b.slug for b in books), "OPDS search '%s' failed." % query)
  
      def test_opds_search_simple(self):
          self.assertEqual(set(slugs), set(b.slug for b in books), "OPDS search '%s' failed." % query)
  
      def test_opds_search_simple(self):
diff --git a/src/opds/views.py b/src/opds/views.py

index 8e929c6..63c79a2 100644 (file)
--- a/src/opds/views.py
+++ b/src/opds/views.py
@@ -16,8 +16,8 @@ from django.utils.functional import lazy
  
  from basicauth import logged_in_or_basicauth, factory_decorator
  from catalogue.models import Book, Tag
  
  from basicauth import logged_in_or_basicauth, factory_decorator
  from catalogue.models import Book, Tag
+from search.utils import UnaccentSearchQuery, UnaccentSearchVector
  
  
-from search.views import Search
  import operator
  import logging
  import re
  import operator
  import logging
  import re
@@ -350,15 +350,6 @@ class SearchFeed(AcquisitionFeed):
          'text': (10, 11),
          }
  
          'text': (10, 11),
          }
  
-    PARAMS_TO_FIELDS = {
-        'author': 'authors',
-        'translator': 'translators',
-        #        'title': 'title',
-        'categories': 'tag_name_pl',
-        'description': 'text',
-        #        'text': 'text',
-        }
-
      ATOM_PLACEHOLDER = re.compile(r"^{(atom|opds):\w+}$")
  
      def get_object(self, request):
      ATOM_PLACEHOLDER = re.compile(r"^{(atom|opds):\w+}$")
  
      def get_object(self, request):
@@ -413,30 +404,33 @@ class SearchFeed(AcquisitionFeed):
              # query is set above.
              log.debug("Inline query = [%s], criteria: %s" % (query, criteria))
  
              # query is set above.
              log.debug("Inline query = [%s], criteria: %s" % (query, criteria))
  
-        srch = Search()
-
-        book_hit_filter = srch.index.Q(book_id__any=True)
-        filters = [book_hit_filter] + [srch.index.Q(
-            **{self.PARAMS_TO_FIELDS.get(cn, cn): criteria[cn]}
-            ) for cn in self.MATCHES.keys() if cn in criteria
-            if criteria[cn]]
-
+        books = Book.objects.filter(findable=True).annotate(
+            search_vector=UnaccentSearchVector('title')
+        )
          if query:
          if query:
-            q = srch.index.query(
-                reduce(
-                    operator.or_,
-                    [srch.index.Q(**{self.PARAMS_TO_FIELDS.get(cn, cn): query}) for cn in self.MATCHES.keys()],
-                    srch.index.Q()))
-        else:
-            q = srch.index.query(srch.index.Q())
-
-        q = srch.apply_filters(q, filters).field_limit(score=True, fields=['book_id'])
-        results = q.execute()
-
-        book_scores = dict([(r['book_id'], r['score']) for r in results])
-        books = Book.objects.filter(findable=True, id__in=set([r['book_id'] for r in results]))
-        books = list(books)
-        books.sort(reverse=True, key=lambda book: book_scores[book.id])
+            squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
+            books = books.filter(search_vector=squery)
+        if criteria['author']:
+            authors = Tag.objects.filter(category='author').annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=UnaccentSearchQuery(criteria['author'], config=settings.SEARCH_CONFIG))
+            books = books.filter(tag_relations__tag__in=authors)
+        if criteria['categories']:
+            tags = Tag.objects.filter(category__in=('genre', 'kind', 'epoch')).annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=UnaccentSearchQuery(criteria['categories'], config=settings.SEARCH_CONFIG))
+            books = books.filter(tag_relations__tag__in=tags)
+        if criteria['translator']:
+            # TODO
+            pass
+        if criteria['title']:
+            books = books.filter(
+                search_vector=UnaccentSearchQuery(criteria['title'], config=settings.SEARCH_CONFIG)
+            )
+
+        books = books.exclude(ancestor__in=books)
+
+        books = books.order_by('popularity__count')
          return books
  
      def get_link(self, query):
          return books
  
      def get_link(self, query):
diff --git a/src/pdcounter/models.py b/src/pdcounter/models.py

index 2e1e0b9..5e94d5e 100644 (file)
--- a/src/pdcounter/models.py
+++ b/src/pdcounter/models.py
@@ -110,15 +110,3 @@ class BookStub(models.Model):
  
      def pretty_title(self, html_links=False):
          return ', '.join((self.author, self.title))
  
      def pretty_title(self, html_links=False):
          return ', '.join((self.author, self.title))
-
-
-if not settings.NO_SEARCH_INDEX:
-    def update_index(sender, instance, **kwargs):
-        from search.index import Index
-        idx = Index()
-        idx.index_tags(instance, remove_only='created' not in kwargs)
-
-    post_delete.connect(update_index, Author)
-    post_delete.connect(update_index, BookStub)
-    post_save.connect(update_index, Author)
-    post_save.connect(update_index, BookStub)
diff --git a/src/picture/models.py b/src/picture/models.py

index b9ddcae..2dadd0c 100644 (file)
--- a/src/picture/models.py
+++ b/src/picture/models.py
@@ -180,7 +180,7 @@ class Picture(models.Model):
              return None
  
      @classmethod
              return None
  
      @classmethod
-    def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False, search_index=True):
+    def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False):
          """
          Import xml and it's accompanying image file.
          If image file is missing, it will be fetched by librarian.picture.ImageStore
          """
          Import xml and it's accompanying image file.
          If image file is missing, it will be fetched by librarian.picture.ImageStore
@@ -305,8 +305,6 @@ class Picture(models.Model):
              picture.xml_file.save("%s.xml" % picture.slug, File(xml_file))
              picture.save()
              tasks.generate_picture_html(picture.id)
              picture.xml_file.save("%s.xml" % picture.slug, File(xml_file))
              picture.save()
              tasks.generate_picture_html(picture.id)
-            if not settings.NO_SEARCH_INDEX and search_index:
-                tasks.index_picture.delay(picture.id, picture_info=picture_xml.picture_info)
  
          if close_xml_file:
              xml_file.close()
  
          if close_xml_file:
              xml_file.close()
@@ -378,17 +376,3 @@ class Picture(models.Model):
      def clear_cache(self):
          clear_cached_renders(self.mini_box)
          clear_cached_renders(self.midi_box)
      def clear_cache(self):
          clear_cached_renders(self.mini_box)
          clear_cached_renders(self.midi_box)
-
-    def search_index(self, picture_info=None, index=None, index_tags=True, commit=True):
-        if index is None:
-            from search.index import Index
-            index = Index()
-        try:
-            index.index_picture(self, picture_info)
-            if index_tags:
-                index.index_tags()
-            if commit:
-                index.index.commit()
-        except Exception as e:
-            index.index.rollback()
-            raise e
diff --git a/src/picture/tasks.py b/src/picture/tasks.py

index ff9aa13..86b9829 100644 (file)
--- a/src/picture/tasks.py
+++ b/src/picture/tasks.py
@@ -2,8 +2,6 @@
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  import json
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  import json
-from traceback import print_exc
-
  from celery import shared_task
  from django.core.files.base import ContentFile
  from django.template.loader import render_to_string
  from celery import shared_task
  from django.core.files.base import ContentFile
  from django.template.loader import render_to_string
@@ -20,14 +18,3 @@ def generate_picture_html(picture_id):
                  'themes': areas_json['themes'],
                  })
      pic.html_file.save("%s.html" % pic.slug, ContentFile(html_text))
                  'themes': areas_json['themes'],
                  })
      pic.html_file.save("%s.html" % pic.slug, ContentFile(html_text))
-
-
-@shared_task
-def index_picture(picture_id, picture_info=None, **kwargs):
-    from picture.models import Picture
-    try:
-        return Picture.objects.get(id=picture_id).search_index(picture_info, **kwargs)
-    except Exception as e:
-        print("Exception during index: %s" % e)
-        print_exc()
-        raise e
diff --git a/src/search/custom.py b/src/search/custom.py

deleted file mode 100644 (file)

index 9337157..0000000
--- a/src/search/custom.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import re
-from urllib.parse import urlencode
-import warnings
-from httplib2 import socket
-from lxml import etree
-from scorched import connection, exc, search
-
-
-class CustomSolrConnection(connection.SolrConnection):
-    def __init__(self, *args, **kw):
-        super(CustomSolrConnection, self).__init__(*args, **kw)
-        self.analysis_url = self.url + "analysis/field/"
-
-    def analyze(self, params):
-        qs = urlencode(params)
-        url = "%s?%s" % (self.analysis_url, qs)
-        if len(url) > self.max_length_get_url:
-            warnings.warn("Long query URL encountered - POSTing instead of GETting. "
-                          "This query will not be cached at the HTTP layer")
-            url = self.analysis_url
-            kwargs = dict(
-                method="POST",
-                data=qs,
-                headers={"Content-Type": "application/x-www-form-urlencoded"},
-            )
-        else:
-            kwargs = dict(method="GET")
-        response = self.request(url=url, **kwargs)
-        if response.status_code != 200:
-            raise exc.SolrError(response)
-        return response.content
-
-
-class CustomSolrInterface(connection.SolrInterface):
-    # just copied from parent and SolrConnection -> CustomSolrConnection
-    def __init__(self, url, http_connection=None, mode='',
-                 retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL,
-                 search_timeout=()):
-        """
-        :param url: url to Solr
-        :type url: str
-        :param http_connection: optional -- already existing connection
-        :type http_connection: requests connection
-        :param mode: optional -- mode (readable, writable) Solr
-        :type mode: str
-        :param retry_timeout: optional -- timeout until retry
-        :type retry_timeout: int
-        :param max_length_get_url: optional -- max length until switch to post
-        :type max_length_get_url: int
-        :param search_timeout: (optional) How long to wait for the server to
-                               send data before giving up, as a float, or a
-                               (connect timeout, read timeout) tuple.
-        :type search_timeout: float or tuple
-        """
-
-        self.conn = CustomSolrConnection(
-            url, http_connection, mode, retry_timeout, max_length_get_url)
-        self.schema = self.init_schema()
-        self._datefields = self._extract_datefields(self.schema)
-
-
-    def _analyze(self, **kwargs):
-        if not self.conn.readable:
-            raise TypeError("This Solr instance is only for writing")
-        args = {
-            'analysis_showmatch': True
-            }
-        if 'field' in kwargs:
-            args['analysis_fieldname'] = kwargs['field']
-        if 'text' in kwargs:
-            args['analysis_fieldvalue'] = kwargs['text']
-        if 'q' in kwargs:
-            args['q'] = kwargs['q']
-        if 'query' in kwargs:
-            args['q'] = kwargs['q']
-
-        params = [
-            (k.replace('_', '.'), v)
-            for (k, v) in search.params_from_dict(**args)
-        ]
-
-        content = self.conn.analyze(params)
-        doc = etree.fromstring(content)
-        return doc
-
-    def highlight(self, **kwargs):
-        doc = self._analyze(**kwargs)
-        analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
-        matches = set()
-        for wrd in analyzed:
-            start = int(wrd.xpath("int[@name='start']")[0].text)
-            end = int(wrd.xpath("int[@name='end']")[0].text)
-            matches.add((start, end))
-
-        if matches:
-            return self.substring(
-                kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("<b>", "</b>")))
-        else:
-            return None
-
-    def analyze(self, **kwargs):
-        doc = self._analyze(**kwargs)
-        terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
-        terms = map(lambda n: str(n.text), terms)
-        return terms
-
-    def expand_margins(self, text, start, end):
-        totlen = len(text)
-
-        def is_boundary(x):
-            ws = re.compile(r"\W", re.UNICODE)
-            return bool(ws.match(x))
-
-        while start > 0:
-            if is_boundary(text[start - 1]):
-                break
-            start -= 1
-
-        while end < totlen - 1:
-            if is_boundary(text[end + 1]):
-                break
-            end += 1
-
-        return start, end
-
-    def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
-        totlen = len(text)
-        matches_margins = [
-            ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches]
-
-        # lets start with first match
-        (start, end) = matches_margins[0][1]
-        new_matches = [matches_margins[0][0]]
-
-        for (m, (s, e)) in matches_margins[1:]:
-            if end < s or start > e:
-                continue
-            start = min(start, s)
-            end = max(end, e)
-            new_matches.append(m)
-
-        snip = text[start:end]
-        new_matches.sort(key=lambda a: -a[0])
-
-        for (s, e) in new_matches:
-            off = -start
-            snip = snip[:e + off] + mark[1] + snip[e + off:]
-            snip = snip[:s + off] + mark[0] + snip[s + off:]
-        snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip)
-
-        return snip
diff --git a/src/search/forms.py b/src/search/forms.py

index 176c73e..3f6c99b 100644 (file)
--- a/src/search/forms.py
+++ b/src/search/forms.py
@@ -158,8 +158,8 @@ class SearchFilters(forms.Form):
      def results(self):
          qs = self.get_querysets()
          query = self.cleaned_data['q']
      def results(self):
          qs = self.get_querysets()
          query = self.cleaned_data['q']
-        squery = UnaccentSearchQuery(query, config='polish')
-        query = SearchQuery(query, config='polish')
+        squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
+        query = SearchQuery(query, config=settings.SEARCH_CONFIG)
          books = qs['book'].annotate(
              search_vector=UnaccentSearchVector('title')
          ).filter(search_vector=squery)
          books = qs['book'].annotate(
              search_vector=UnaccentSearchVector('title')
          ).filter(search_vector=squery)
@@ -169,7 +169,7 @@ class SearchFilters(forms.Form):
                      headline=SearchHeadline(
                          'text',
                          query,
                      headline=SearchHeadline(
                          'text',
                          query,
-                        config='polish',
+                        config=settings.SEARCH_CONFIG,
                          start_sel='<strong>',
                          stop_sel='</strong>',
                      )
                          start_sel='<strong>',
                          stop_sel='</strong>',
                      )
diff --git a/src/search/index.py b/src/search/index.py

index 4606f57..fc9e9d5 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -1,299 +1,15 @@
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from functools import reduce, total_ordering
-from itertools import chain
-import logging
-import operator
-import os
  import re
  import re
-from django.conf import settings
-from librarian import dcparser
-import librarian.meta.types.person
-import librarian.meta.types.text
  from librarian.parser import WLDocument
  from lxml import etree
  from librarian.parser import WLDocument
  from lxml import etree
-import scorched
-import catalogue.models
-import picture.models
-from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
-from wolnelektury.utils import makedirs
-from . import custom
  
  
-log = logging.getLogger('search')
  
  
-
-if os.path.isfile(settings.SOLR_STOPWORDS):
-    stopwords = set(
-        line.strip()
-        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
-else:
-    stopwords = set()
-
-
-class SolrIndex(object):
-    def __init__(self, mode=None):
-        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
-
-
-class Snippets(object):
-    """
-    This class manages snippet files for indexed object (book)
-    the snippets are concatenated together, and their positions and
-    lengths are kept in lucene index fields.
-    """
-    SNIPPET_DIR = "snippets"
-
-    def __init__(self, book_id, revision=None):
-        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
-        self.book_id = book_id
-        self.revision = revision
-        self.file = None
-        self.position = None
-
-    @property
-    def path(self):
-        if self.revision:
-            fn = "%d.%d" % (self.book_id, self.revision)
-        else:
-            fn = "%d" % self.book_id
-
-        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
-
-    def open(self, mode='r'):
-        """
-        Open the snippet file. Call .close() afterwards.
-        """
-        if 'b' not in mode:
-            mode += 'b'
-
-        if 'w' in mode:
-            if os.path.exists(self.path):
-                self.revision = 1
-                while True:
-                    if not os.path.exists(self.path):
-                        break
-                    self.revision += 1
-
-        self.file = open(self.path, mode)
-        self.position = 0
-        return self
-
-    def add(self, snippet):
-        """
-        Append a snippet (unicode) to the snippet file.
-        Return a (position, length) tuple
-        """
-        txt = snippet.encode('utf-8')
-        l = len(txt)
-        self.file.write(txt)
-        pos = (self.position, l)
-        self.position += l
-        return pos
-
-    def get(self, pos):
-        """
-        Given a tuple of (position, length) return an unicode
-        of the snippet stored there.
-        """
-        self.file.seek(pos[0], 0)
-        try:
-            txt = self.file.read(pos[1]).decode('utf-8')
-        except:
-            return ''
-        return txt
-
-    def close(self):
-        """Close snippet file"""
-        if self.file:
-            self.file.close()
-
-    def remove(self):
-        self.revision = None
-        try:
-            os.unlink(self.path)
-            self.revision = 0
-            while True:
-                self.revision += 1
-                os.unlink(self.path)
-        except OSError:
-            pass
-
-
-class Index(SolrIndex):
+class Index:
      """
      Class indexing books.
      """
      """
      Class indexing books.
      """
-    def __init__(self):
-        super(Index, self).__init__(mode='rw')
-
-    def remove_snippets(self, book):
-        book.snippet_set.all().delete()
-
-    def add_snippet(self, book, doc):
-        assert book.id == doc.pop('book_id')
-        # Fragments already exist and can be indexed where they live.
-        if 'fragment_anchor' in doc:
-            return
-
-        text = doc.pop('text')
-        header_index = doc.pop('header_index')
-        book.snippet_set.create(
-            sec=header_index,
-            text=text,
-        )
-
-    def delete_query(self, *queries):
-        """
-        index.delete(queries=...) doesn't work, so let's reimplement it
-        using deletion of list of uids.
-        """
-        uids = set()
-        for q in queries:
-            if isinstance(q, scorched.search.LuceneQuery):
-                q = self.index.query(q)
-            q.field_limiter.update(['uid'])
-            st = 0
-            rows = 100
-            while True:
-                ids = q.paginate(start=st, rows=rows).execute()
-                if not len(ids):
-                    break
-                for res in ids:
-                    uids.add(res['uid'])
-                st += rows
-        if uids:
-            # FIXME: With Solr API change, this doesn't work.
-            #self.index.delete(uids)
-            return True
-        else:
-            return False
-
-    def index_tags(self, *tags, **kw):
-        """
-        Re-index global tag list.
-        Removes all tags from index, then index them again.
-        Indexed fields include: id, name (with and without polish stems), category
-        """
-        log.debug("Indexing tags")
-        remove_only = kw.get('remove_only', False)
-        # first, remove tags from index.
-        if tags:
-            tag_qs = []
-            for tag in tags:
-                q_id = self.index.Q(tag_id=tag.id)
-
-                if isinstance(tag, PDCounterAuthor):
-                    q_cat = self.index.Q(tag_category='pd_author')
-                elif isinstance(tag, PDCounterBook):
-                    q_cat = self.index.Q(tag_category='pd_book')
-                else:
-                    q_cat = self.index.Q(tag_category=tag.category)
-
-                q_id_cat = self.index.Q(q_id & q_cat)
-                tag_qs.append(q_id_cat)
-            self.delete_query(*tag_qs)
-        else:  # all
-            q = self.index.Q(tag_id__any=True)
-            self.delete_query(q)
-
-        if not remove_only:
-            # then add them [all or just one passed]
-            if not tags:
-                tags = chain(
-                    catalogue.models.Tag.objects.exclude(category='set'),
-                    PDCounterAuthor.objects.all(),
-                    PDCounterBook.objects.all())
-
-            for tag in tags:
-                if isinstance(tag, PDCounterAuthor):
-                    doc = {
-                        "tag_id": int(tag.id),
-                        "tag_name": tag.name,
-                        "tag_name_pl": tag.name,
-                        "tag_category": 'pd_author',
-                        "is_pdcounter": True,
-                        "uid": "tag%d_pd_a" % tag.id
-                        }
-                elif isinstance(tag, PDCounterBook):
-                    doc = {
-                        "tag_id": int(tag.id),
-                        "tag_name": tag.title,
-                        "tag_name_pl": tag.title,
-                        "tag_category": 'pd_book',
-                        "is_pdcounter": True,
-                        "uid": "tag%d_pd_b" % tag.id
-                        }
-                else:
-                    doc = {
-                        "tag_id": int(tag.id),
-                        "tag_name": tag.name,
-                        "tag_name_pl": tag.name,
-                        "tag_category": tag.category,
-                        "is_pdcounter": False,
-                        "uid": "tag%d" % tag.id
-                        }
-                self.index.add(doc)
-
-    def create_book_doc(self, book):
-        """
-        Create a lucene document referring book id.
-        """
-        doc = {'book_id': int(book.id)}
-        if book.parent is not None:
-            doc['parent_id'] = int(book.parent.id)
-        return doc
-
-    def remove_book(self, book, remove_snippets=True, legacy=True):
-        """Removes a book from search index.
-        book - Book instance."""
-        if legacy:
-          self.delete_query(self.index.Q(book_id=book.id))
-
-          if remove_snippets:
-            snippets = Snippets(book.id)
-            snippets.remove()
-        self.remove_snippets(book)
-
-    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
-        """
-        Indexes the book.
-        Creates a lucene document for extracted metadata
-        and calls self.index_content() to index the contents of the book.
-        """
-        if not book.xml_file: return
-
-        if overwrite:
-            # we don't remove snippets, since they might be still needed by
-            # threads using not reopened index
-            self.remove_book(book, remove_snippets=False, legacy=legacy)
-
-        book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=[
-            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
-        # let's not index it - it's only used for extracting publish date
-        if 'source_name' in meta_fields:
-            del meta_fields['source_name']
-
-        for n, f in meta_fields.items():
-            book_doc[n] = f
-
-        book_doc['uid'] = "book%s" % book_doc['book_id']
-        if legacy:
-            self.index.add(book_doc)
-        del book_doc
-        book_fields = {
-            'title': meta_fields['title'],
-            'authors': meta_fields['authors'],
-            'published_date': meta_fields['published_date']
-            }
-
-        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
-            if tag_name in meta_fields:
-                book_fields[tag_name] = meta_fields[tag_name]
-
-        self.index_content(book, book_fields=book_fields, legacy=legacy)
-
      master_tags = [
          'opowiadanie',
          'powiesc',
      master_tags = [
          'opowiadanie',
          'powiesc',
@@ -307,7 +23,7 @@ class Index(SolrIndex):
          'uwaga', 'extra', 'nota_red', 'abstrakt',
          'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
          'didaskalia',
          'uwaga', 'extra', 'nota_red', 'abstrakt',
          'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
          'didaskalia',
-        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
+        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw'
      ]
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
      ]
  
      footnote_tags = ['pa', 'pt', 'pr', 'pe']
@@ -315,85 +31,41 @@ class Index(SolrIndex):
      skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                          '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  
      skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                          '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  
-    published_date_re = re.compile("([0-9]+)[\]. ]*$")
-
-    def extract_metadata(self, book, book_info=None, dc_only=None):
-        """
-        Extract metadata from book and returns a map of fields keyed by fieldname
-        """
-        fields = {}
-
-        if book_info is None:
-            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
-
-        fields['slug'] = book.slug
-        fields['is_book'] = True
-
-        # validator, name
-        for field in dcparser.BookInfo.FIELDS:
-            if dc_only and field.name not in dc_only:
-                continue
-            if hasattr(book_info, field.name):
-                if not getattr(book_info, field.name):
-                    continue
-                type_indicator = field.value_type
-                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
-                    s = getattr(book_info, field.name)
-                    if field.multiple:
-                        s = ', '.join(s)
-                    fields[field.name] = s
-                elif issubclass(type_indicator, librarian.meta.types.person.Person):
-                    p = getattr(book_info, field.name)
-                    if isinstance(p, librarian.meta.types.person.Person):
-                        persons = str(p)
-                    else:
-                        persons = ', '.join(map(str, p))
-                    fields[field.name] = persons
-
-        # get published date
-        pd = None
-        if hasattr(book_info, 'source_name') and book_info.source_name:
-            match = self.published_date_re.search(book_info.source_name)
-            if match is not None:
-                pd = str(match.groups()[0])
-        if not pd:
-            pd = ""
-        fields["published_date"] = pd
-
-        return fields
-
-    # def add_gaps(self, fields, fieldname):
-    #     """
-    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
-    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
-    #     """
-    #     def gap():
-    #         while True:
-    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
-    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
-
-    def get_master(self, root):
+    @classmethod
+    def get_master(cls, root):
          """
          Returns the first master tag from an etree.
          """
          for master in root.iter():
          """
          Returns the first master tag from an etree.
          """
          for master in root.iter():
-            if master.tag in self.master_tags:
+            if master.tag in cls.master_tags:
                  return master
  
                  return master
  
-    def index_content(self, book, book_fields, legacy=True):
+    @staticmethod
+    def add_snippet(book, text, position):
+        book.snippet_set.create(
+            sec=position + 1,
+            text=text
+        )
+
+    @classmethod
+    def index_book(cls, book):
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
          """
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
          """
+        if not book.xml_file: return
+
+        book.snippet_set.all().delete()
+
          wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
          root = wld.edoc.getroot()
  
          wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
          root = wld.edoc.getroot()
  
-        master = self.get_master(root)
+        master = cls.get_master(root)
          if master is None:
              return []
  
          def walker(node):
          if master is None:
              return []
  
          def walker(node):
-            if node.tag not in self.ignore_content_tags:
+            if node.tag not in cls.ignore_content_tags:
                  yield node, None, None
                  if node.text is not None:
                      yield None, node.text, None
                  yield node, None, None
                  if node.text is not None:
                      yield None, node.text, None
@@ -407,627 +79,43 @@ class Index(SolrIndex):
              return
  
          def fix_format(text):
              return
  
          def fix_format(text):
-            # separator = [" ", "\t", ".", ";", ","]
              if isinstance(text, list):
              if isinstance(text, list):
-                # need to join it first
                  text = filter(lambda s: s is not None, content)
                  text = ' '.join(text)
                  text = filter(lambda s: s is not None, content)
                  text = ' '.join(text)
-                # for i in range(len(text)):
-                #     if i > 0:
-                #         if text[i][0] not in separator\
-                #             and text[i - 1][-1] not in separator:
-                #          text.insert(i, " ")
  
              return re.sub("(?m)/$", "", text)
  
  
              return re.sub("(?m)/$", "", text)
  
-        def add_part(snippets, **fields):
-            doc = self.create_book_doc(book)
-            for n, v in book_fields.items():
-                doc[n] = v
-
-            doc['header_index'] = fields["header_index"]
-            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
-            doc['header_type'] = fields['header_type']
-
-            doc['text'] = fields['text']
-
-            # snippets
-            snip_pos = snippets.add(fields["text"])
-
-            doc['snippets_position'] = snip_pos[0]
-            doc['snippets_length'] = snip_pos[1]
-            if snippets.revision:
-                doc["snippets_revision"] = snippets.revision
-
-            if 'fragment_anchor' in fields:
-                doc["fragment_anchor"] = fields['fragment_anchor']
-
-            if 'themes' in fields:
-                doc['themes'] = fields['themes']
-            doc['uid'] = "part%s-%s-%s-%s" % (
-                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
-            return doc
-
-        fragments = {}
-        snippets = Snippets(book.id).open('w')
-        try:
-            for header, position in zip(list(master), range(len(master))):
-
-                if header.tag in self.skip_header_tags:
-                    continue
-                if header.tag is etree.Comment:
-                    continue
-
-                # section content
-                content = []
-                footnote = []
-
-                def all_content(text):
-                    for frag in fragments.values():
-                        frag['text'].append(text)
-                    content.append(text)
-                handle_text = [all_content]
-
-                for start, text, end in walker(header):
-                    # handle footnotes
-                    if start is not None and start.tag in self.footnote_tags:
-                        footnote = []
-
-                        def collect_footnote(t):
-                            footnote.append(t)
-
-                        handle_text.append(collect_footnote)
-                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
-                        handle_text.pop()
-                        doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       text=''.join(footnote))
-                        self.add_snippet(book, doc)
-                        if legacy:
-                            self.index.add(doc)
-                        footnote = []
-
-                    # handle fragments and themes.
-                    if start is not None and start.tag == 'begin':
-                        fid = start.attrib['id'][1:]
-                        fragments[fid] = {
-                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
-
-                    # themes for this fragment
-                    elif start is not None and start.tag == 'motyw':
-                        fid = start.attrib['id'][1:]
-                        handle_text.append(lambda text: None)
-                        if start.text is not None:
-                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
-                    elif end is not None and end.tag == 'motyw':
-                        handle_text.pop()
-
-                    elif start is not None and start.tag == 'end':
-                        fid = start.attrib['id'][1:]
-                        if fid not in fragments:
-                            continue  # a broken <end> node, skip it
-                        frag = fragments[fid]
-                        if not frag['themes']:
-                            continue  # empty themes list.
-                        del fragments[fid]
-
-                        doc = add_part(snippets,
-                                       header_type=frag['start_header'],
-                                       header_index=frag['start_section'],
-                                       header_span=position - frag['start_section'] + 1,
-                                       fragment_anchor=fid,
-                                       text=fix_format(frag['text']),
-                                       themes=frag['themes'])
-                        # Add searchable fragment
-                        self.add_snippet(book, doc)
-                        if legacy:
-                            self.index.add(doc)
-
-                        # Collect content.
-
-                    if text is not None and handle_text is not []:
-                        hdl = handle_text[-1]
-                        hdl(text)
-
-                        # in the end, add a section text.
-                doc = add_part(snippets, header_index=position,
-                               header_type=header.tag, text=fix_format(content))
-
-                self.add_snippet(book, doc)
-                if legacy:
-                    self.index.add(doc)
-
-        finally:
-            snippets.close()
-
-    def remove_picture(self, picture_or_id):
-        """Removes a picture from search index."""
-        if isinstance(picture_or_id, picture.models.Picture):
-            picture_id = picture_or_id.id
-        else:
-            picture_id = picture_or_id
-        self.delete_query(self.index.Q(picture_id=picture_id))
-
-    def index_picture(self, picture, picture_info=None, overwrite=True):
-        """
-        Indexes the picture.
-        Creates a lucene document for extracted metadata
-        and calls self.index_area() to index the contents of the picture.
-        """
-        if overwrite:
-            # we don't remove snippets, since they might be still needed by
-            # threads using not reopened index
-            self.remove_picture(picture)
-
-        picture_doc = {'picture_id': int(picture.id)}
-        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
-            'authors', 'title', 'epochs', 'kinds', 'genres'])
-
-        picture_doc.update(meta_fields)
-
-        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
-        self.index.add(picture_doc)
-        del picture_doc['is_book']
-        for area in picture.areas.all():
-            self.index_area(area, picture_fields=picture_doc)
-
-    def index_area(self, area, picture_fields):
-        """
-        Indexes themes and objects on the area.
-        """
-        doc = dict(picture_fields)
-        doc['area_id'] = area.id
-        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
-        doc['uid'] = 'area%s' % area.id
-        self.index.add(doc)
-
-
-@total_ordering
-class SearchResult(object):
-    def __init__(self, doc, how_found=None, query_terms=None):
-        self.boost = 1.0
-        self._hits = []
-        self._processed_hits = None  # processed hits
-        self.snippets = []
-        self.query_terms = query_terms
-        self._book = None
-
-        if 'score' in doc:
-            self._score = doc['score']
-        else:
-            self._score = 0
-
-        self.book_id = int(doc["book_id"])
-
-        try:
-            self.published_date = int(doc.get("published_date"))
-        except ValueError:
-            self.published_date = 0
-
-        # content hits
-        header_type = doc.get("header_type", None)
-        # we have a content hit in some header of fragment
-        if header_type is not None:
-            sec = (header_type, int(doc["header_index"]))
-            header_span = doc['header_span']
-            header_span = header_span is not None and int(header_span) or 1
-            fragment = doc.get("fragment_anchor", None)
-            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-            snippets_rev = doc.get('snippets_revision', None)
-
-            hit = (sec + (header_span,), fragment, self._score, {
-                'how_found': how_found,
-                'snippets_pos': snippets_pos,
-                'snippets_revision': snippets_rev,
-                'themes': doc.get('themes', []),
-                'themes_pl': doc.get('themes_pl', [])
-                })
-
-            self._hits.append(hit)
-
-    @classmethod
-    def from_book(cls, book, how_found=None, query_terms=None):
-        doc = {
-            'score': book.popularity.count,
-            'book_id': book.id,
-            'published_date': 0,
-        }
-        result = cls(doc, how_found=how_found, query_terms=query_terms)
-        result._book = book
-        return result
-
-    def __str__(self):
-        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
-            (self.book_id, len(self._hits),
-             len(self._processed_hits) if self._processed_hits else -1,
-             self._score, len(self.snippets))
-
-    def __bytes__(self):
-        return str(self).encode('utf-8')
-
-    @property
-    def score(self):
-        return self._score * self.boost
-
-    def merge(self, other):
-        if self.book_id != other.book_id:
-            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
-        self._hits += other._hits
-        self._score += max(other._score, 0)
-        return self
-
-    def get_book(self):
-        if self._book is not None:
-            return self._book
-        try:
-            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
-        except catalogue.models.Book.DoesNotExist:
-            self._book = None
-        return self._book
-
-    book = property(get_book)
-
-    POSITION = 0
-    FRAGMENT = 1
-    POSITION_INDEX = 1
-    POSITION_SPAN = 2
-    SCORE = 2
-    OTHER = 3
-
-    @property
-    def hits(self):
-        if self._processed_hits is not None:
-            return self._processed_hits
-
-        # to sections and fragments
-        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
-
-        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
-
-        # sections not covered by fragments
-        sect = filter(lambda s: 0 == len(list(filter(
-            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
-                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
-
-        def remove_duplicates(lst, keyfn, larger):
-            els = {}
-            for e in lst:
-                eif = keyfn(e)
-                if eif in els:
-                    if larger(els[eif], e):
-                        continue
-                els[eif] = e
-            return els.values()
-
-        # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
-
-        # remove duplicate sections
-        sections = {}
-
-        for s in sect:
-            si = s[self.POSITION][self.POSITION_INDEX]
-            # skip existing
-            if si in sections:
-                if sections[si]['score'] >= s[self.SCORE]:
-                    continue
-
-            m = {'score': s[self.SCORE],
-                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
-                 }
-            m.update(s[self.OTHER])
-            sections[si] = m
-
-        hits = list(sections.values())
-
-        for f in frags:
-            try:
-                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
-            except catalogue.models.Fragment.DoesNotExist:
-                # stale index
+        for position, header in enumerate(master):
+            if header.tag in cls.skip_header_tags:
                  continue
                  continue
-            # Figure out if we were searching for a token matching some word in theme name.
-            themes = frag.tags.filter(category='theme')
-            themes_hit = set()
-            if self.query_terms is not None:
-                for i in range(0, len(f[self.OTHER]['themes'])):
-                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(str.lower, tms)
-                    for qt in self.query_terms:
-                        if qt in tms:
-                            themes_hit.add(f[self.OTHER]['themes'][i])
-                            break
-
-            def theme_by_name(n):
-                th = list(filter(lambda t: t.name == n, themes))
-                if th:
-                    return th[0]
-                else:
-                    return None
-            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
-
-            m = {'score': f[self.SCORE],
-                 'fragment': frag,
-                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
-                 'themes': themes,
-                 'themes_hit': themes_hit
-                 }
-            m.update(f[self.OTHER])
-            hits.append(m)
-
-        hits.sort(key=lambda h: h['score'], reverse=True)
-
-        self._processed_hits = hits
-
-        return hits
-
-    @staticmethod
-    def aggregate(*result_lists):
-        books = {}
-        for rl in result_lists:
-            for r in rl:
-                if r.book_id in books:
-                    books[r.book_id].merge(r)
-                else:
-                    books[r.book_id] = r
-        return books.values()
-
-    def get_sort_key(self):
-        return (-self.score,
-                self.published_date,
-                self.book.sort_key_author if self.book else '',
-                self.book.sort_key if self.book else '')
-
-    def __lt__(self, other):
-        return self.get_sort_key() > other.get_sort_key()
-
-    def __eq__(self, other):
-        return self.get_sort_key() == other.get_sort_key()
-
-    def __len__(self):
-        return len(self.hits)
-
-    def snippet_pos(self, idx=0):
-        return self.hits[idx]['snippets_pos']
-
-    def snippet_revision(self, idx=0):
-        try:
-            return self.hits[idx]['snippets_revision']
-        except (IndexError, KeyError):
-            return None
-
-
-@total_ordering
-class PictureResult(object):
-    def __init__(self, doc, how_found=None, query_terms=None):
-        self.boost = 1.0
-        self.query_terms = query_terms
-        self._picture = None
-        self._hits = []
-        self._processed_hits = None
-
-        if 'score' in doc:
-            self._score = doc['score']
-        else:
-            self._score = 0
-
-        self.picture_id = int(doc["picture_id"])
-
-        if doc.get('area_id'):
-            hit = (self._score, {
-                'how_found': how_found,
-                'area_id': doc['area_id'],
-                'themes': doc.get('themes', []),
-                'themes_pl': doc.get('themes_pl', []),
-            })
-
-            self._hits.append(hit)
-
-    def __str__(self):
-        return "<PR id=%d score=%f >" % (self.picture_id, self._score)
-
-    def __repr__(self):
-        return str(self)
-
-    @property
-    def score(self):
-        return self._score * self.boost
-
-    def merge(self, other):
-        if self.picture_id != other.picture_id:
-            raise ValueError(
-                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
-        self._hits += other._hits
-        self._score += max(other._score, 0)
-        return self
-
-    SCORE = 0
-    OTHER = 1
-
-    @property
-    def hits(self):
-        if self._processed_hits is not None:
-            return self._processed_hits
-
-        hits = []
-        for hit in self._hits:
-            try:
-                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
-            except picture.models.PictureArea.DoesNotExist:
-                # stale index
+            if header.tag is etree.Comment:
                  continue
                  continue
-            # Figure out if we were searching for a token matching some word in theme name.
-            themes_hit = set()
-            if self.query_terms is not None:
-                for i in range(0, len(hit[self.OTHER]['themes'])):
-                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(str.lower, tms)
-                    for qt in self.query_terms:
-                        if qt in tms:
-                            themes_hit.add(hit[self.OTHER]['themes'][i])
-                            break
  
  
-            m = {
-                'score': hit[self.SCORE],
-                'area': area,
-                'themes_hit': themes_hit,
-            }
-            m.update(hit[self.OTHER])
-            hits.append(m)
+            # section content
+            content = []
+            footnote = []
  
  
-        hits.sort(key=lambda h: h['score'], reverse=True)
-        hits = hits[:1]
-        self._processed_hits = hits
-        return hits
+            def all_content(text):
+                content.append(text)
+            handle_text = [all_content]
  
  
-    def get_picture(self):
-        if self._picture is None:
-            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
-        return self._picture
-
-    picture = property(get_picture)
-
-    @staticmethod
-    def aggregate(*result_lists):
-        books = {}
-        for rl in result_lists:
-            for r in rl:
-                if r.picture_id in books:
-                    books[r.picture_id].merge(r)
-                else:
-                    books[r.picture_id] = r
-        return books.values()
+            for start, text, end in walker(header):
+                # handle footnotes
+                if start is not None and start.tag in cls.footnote_tags:
+                    footnote = []
  
  
-    def __lt__(self, other):
-        return self.score < other.score
+                    def collect_footnote(t):
+                        footnote.append(t)
  
  
-    def __eq__(self, other):
-        return self.score == other.score
-
-
-class Search(SolrIndex):
-    """
-    Search facilities.
-    """
-    def __init__(self, default_field="text"):
-        super(Search, self).__init__(mode='r')
-
-    def make_term_query(self, query, field='text', modal=operator.or_):
-        """
-        Returns term queries joined by boolean query.
-        modal - applies to boolean query
-        fuzzy - should the query by fuzzy.
-        """
-        if query is None:
-            query = ''
-        q = self.index.Q()
-        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
-
-        return q
-
-    def search_by_author(self, words):
-        from catalogue.models import Book
-        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
-        for word in words:
-            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
-        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
-
-    def search_words(self, words, fields, required=None, book=True, picture=False):
-        if book and not picture and fields == ['authors']:
-            return self.search_by_author(words)
-        filters = []
-        for word in words:
-            if book or picture or (word not in stopwords):
-                word_filter = None
-                for field in fields:
-                    q = self.index.Q(**{field: word})
-                    if word_filter is None:
-                        word_filter = q
-                    else:
-                        word_filter |= q
-                filters.append(word_filter)
-        if required:
-            required_filter = None
-            for field in required:
-                for word in words:
-                    if book or picture or (word not in stopwords):
-                        q = self.index.Q(**{field: word})
-                        if required_filter is None:
-                            required_filter = q
-                        else:
-                            required_filter |= q
-            filters.append(required_filter)
-        if not filters:
-            return []
-        params = {}
-        if book:
-            params['is_book'] = True
-        if picture:
-            params['picture_id__gt'] = 0
-        else:
-            params['book_id__gt'] = 0
-        query = self.index.query(**params)
-        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        result_class = PictureResult if picture else SearchResult
-        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
-
-    def get_snippets(self, searchresult, query, field='text', num=1):
-        """
-        Returns a snippet for found scoreDoc.
-        """
-        maxnum = len(searchresult)
-        if num is None or num < 0 or num > maxnum:
-            num = maxnum
-        book_id = searchresult.book_id
-        revision = searchresult.snippet_revision()
-        snippets = Snippets(book_id, revision=revision)
-        snips = [None] * maxnum
-        try:
-            snippets.open()
-            idx = 0
-            while idx < maxnum and num > 0:
-                position, length = searchresult.snippet_pos(idx)
-                if position is None or length is None:
-                    continue
-                text = snippets.get((int(position),
-                                     int(length)))
-                snip = self.index.highlight(text=text, field=field, q=query)
-                if not snip and field == 'text':
-                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
-                if snip not in snips:
-                    snips[idx] = snip
-                    if snip:
-                        num -= 1
-                idx += 1
-
-        except IOError as e:
-            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
-            if not book:
-                log.error("Book does not exist for book id = %d" % book_id)
-            elif not book.get().children.exists():
-                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
-            return []
-        finally:
-            snippets.close()
-
-        # remove verse end markers..
-        snips = [s.replace("/\n", "\n") if s else s for s in snips]
-
-        searchresult.snippets = snips
-
-        return snips
-
-    @staticmethod
-    def apply_filters(query, filters):
-        """
-        Apply filters to a query
-        """
-        if filters is None:
-            filters = []
-        filters = filter(lambda x: x is not None, filters)
-        for f in filters:
-            query = query.query(f)
-        return query
+                    handle_text.append(collect_footnote)
+                elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
+                    handle_text.pop()
+                    cls.add_snippet(book, ''.join(footnote), position)
+                    footnote = []
  
  
+                if text is not None and handle_text is not []:
+                    hdl = handle_text[-1]
+                    hdl(text)
  
  
-if getattr(settings, 'SEARCH_MOCK', False):
-    from .mock_search import Search
+            # in the end, add a section text.
+            cls.add_snippet(book, fix_format(content), position)
diff --git a/src/search/management/__init__.py b/src/search/management/__init__.py

deleted file mode 100644 (file)

index e69de29..0000000
diff --git a/src/search/management/commands/__init__.py b/src/search/management/commands/__init__.py

deleted file mode 100644 (file)

index e69de29..0000000
diff --git a/src/search/management/commands/reindex.py b/src/search/management/commands/reindex.py

deleted file mode 100644 (file)

index c2fe78e..0000000
--- a/src/search/management/commands/reindex.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import sys
-import traceback
-
-from django.core.management.base import BaseCommand
-
-
-def query_yes_no(question, default="yes"):
-    """Ask a yes/no question via raw_input() and return their answer.
-
-    "question" is a string that is presented to the user.
-    "default" is the presumed answer if the user just hits <Enter>.
-        It must be "yes" (the default), "no" or None (meaning
-        an answer is required of the user).
-
-    The "answer" return value is one of "yes" or "no".
-    """
-    valid = {"yes": True, "y": True, "ye": True,
-             "no": False, "n": False}
-    if default is None:
-        prompt = " [y/n] "
-    elif default == "yes":
-        prompt = " [Y/n] "
-    elif default == "no":
-        prompt = " [y/N] "
-    else:
-        raise ValueError("invalid default answer: '%s'" % default)
-
-    while True:
-        sys.stdout.write(question + prompt)
-        choice = raw_input().lower()
-        if default is not None and choice == '':
-            return valid[default]
-        elif choice in valid:
-            return valid[choice]
-        else:
-            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-
-
-class Command(BaseCommand):
-    help = 'Reindex everything.'
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-                '-n', '--book-id', action='store_true', dest='book_id',
-                default=False, help='book id instead of slugs')
-        parser.add_argument(
-                '-t', '--just-tags', action='store_true', dest='just_tags',
-                default=False, help='just reindex tags')
-        parser.add_argument(
-                '--start', dest='start_from', default=None,
-                help='start from this slug')
-        parser.add_argument(
-                '--stop', dest='stop_after', default=None,
-                help='stop after this slug')
-        parser.add_argument('args', nargs='*', metavar='slug/id')
-
-    def handle(self, **opts):
-        from catalogue.models import Book
-        from search.index import Index
-        idx = Index()
-        
-        if not opts['just_tags']:
-            if opts['args']:
-                books = []
-                for a in opts['args']:
-                    if opts['book_id']:
-                        books += Book.objects.filter(id=int(a)).all()
-                    else:
-                        books += Book.objects.filter(slug=a).all()
-            else:
-                books = list(Book.objects.order_by('slug'))
-            start_from = opts.get('start_from')
-            stop_after = opts.get('stop_after')
-            if start_from:
-                start_from = start_from.replace('-', '')
-            if stop_after:
-                stop_after = stop_after.replace('-', '')
-            while books:
-                try:
-                    b = books[0]
-                    slug = b.slug.replace('-', '')
-                    if stop_after and slug > stop_after:
-                        break
-                    if not start_from or slug >= start_from:
-                        print(b.slug)
-                        idx.index_book(b)
-                        idx.index.commit()
-                    books.pop(0)
-                except:
-                    traceback.print_exc()
-                    try:
-                        # we might not be able to rollback
-                        idx.index.rollback()
-                    except:
-                        pass
-                    retry = query_yes_no("Retry?")
-                    if not retry:
-                        break
-
-        print('Reindexing tags.')
-        idx.index_tags()
-        idx.index.commit()
diff --git a/src/search/management/commands/reindex_pictures.py b/src/search/management/commands/reindex_pictures.py

deleted file mode 100644 (file)

index 8505189..0000000
--- a/src/search/management/commands/reindex_pictures.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import sys
-import traceback
-
-from django.core.management.base import BaseCommand
-
-
-def query_yes_no(question, default="yes"):
-    """Ask a yes/no question via raw_input() and return their answer.
-
-    "question" is a string that is presented to the user.
-    "default" is the presumed answer if the user just hits <Enter>.
-        It must be "yes" (the default), "no" or None (meaning
-        an answer is required of the user).
-
-    The "answer" return value is one of "yes" or "no".
-    """
-    valid = {"yes": True, "y": True, "ye": True,
-             "no": False, "n": False}
-    if default is None:
-        prompt = " [y/n] "
-    elif default == "yes":
-        prompt = " [Y/n] "
-    elif default == "no":
-        prompt = " [y/N] "
-    else:
-        raise ValueError("invalid default answer: '%s'" % default)
-
-    while True:
-        sys.stdout.write(question + prompt)
-        choice = raw_input().lower()
-        if default is not None and choice == '':
-            return valid[default]
-        elif choice in valid:
-            return valid[choice]
-        else:
-            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-
-
-class Command(BaseCommand):
-    help = 'Reindex pictures.'
-
-    def add_arguments(self, parser):
-        self.add_argument(
-                '-n', '--picture-id', action='store_true', dest='picture_id',
-                default=False, help='picture id instead of slugs')
-        self.add_argument('slug/id', nargs='*', metavar='slug/id')
-
-    def handle(self, **opts):
-        from picture.models import Picture
-        from search.index import Index
-        idx = Index()
-
-        if opts['args']:
-            pictures = []
-            for a in opts['args']:
-                if opts['picture_id']:
-                    pictures += Picture.objects.filter(id=int(a)).all()
-                else:
-                    pictures += Picture.objects.filter(slug=a).all()
-        else:
-            pictures = list(Picture.objects.order_by('slug'))
-        while pictures:
-            try:
-                p = pictures[0]
-                print(p.slug)
-                idx.index_picture(p)
-                idx.index.commit()
-                pictures.pop(0)
-            except:
-                traceback.print_exc()
-                try:
-                    # we might not be able to rollback
-                    idx.index.rollback()
-                except:
-                    pass
-                retry = query_yes_no("Retry?")
-                if not retry:
-                    break
diff --git a/src/search/management/commands/snippets.py b/src/search/management/commands/snippets.py

deleted file mode 100644 (file)

index 62512c9..0000000
--- a/src/search/management/commands/snippets.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from glob import glob
-from os import path
-from django.conf import settings
-from django.core.management.base import BaseCommand
-
-
-class Command(BaseCommand):
-    help = 'Check snippets.'
-
-    def handle(self, *args, **opts):
-        sfn = glob(settings.SEARCH_INDEX+'snippets/*')
-        for fn in sfn:
-            print(fn)
-            bkid = path.basename(fn)
-            with open(fn) as f:
-                cont = f.read()
-                try:
-                    cont.decode('utf-8')
-                except UnicodeDecodeError:
-                    print("error in snippets %s" % bkid)
diff --git a/src/search/mock_search.py b/src/search/mock_search.py

deleted file mode 100644 (file)

index 33d2a5e..0000000
--- a/src/search/mock_search.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from unittest.mock import Mock
-from catalogue.models import Book, Tag
-from random import randint, choice
-
-
-class Search(Mock):
-    """
-    Search mock for development without setting up Solr.
-
-    Instead of connecting to an actual search server, it returns
-    some random results for any query.
-    """
-    class MockIndex(Mock):
-        def analyze(*args, **kwargs):
-            return []
-
-    index = MockIndex()
-
-    def search_words(self, words, fields, required=None, book=True, picture=False):
-        from .index import SearchResult
-
-        max_results = 20
-        
-        if picture: return []
-
-        qs = Book.objects.filter(findable=True).order_by('?')
-        results = []
-        for book in qs[:randint(1, max_results)]:
-            doc = {
-                'score': randint(0, 100),
-                'book_id': book.pk,
-                'published_date': randint(1000, 1920),
-                }
-            res = SearchResult(doc, how_found='mock', query_terms=words)
-            results.append(res)
-        return results
-
diff --git a/src/search/tests/index.py b/src/search/tests/index.py

index 34d9586..d63bafb 100644 (file)
--- a/src/search/tests/index.py
+++ b/src/search/tests/index.py
@@ -1,40 +1,27 @@
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
-from unittest import skipIf
  from django.conf import settings
  from django.test.utils import override_settings
  from catalogue.test_utils import WLTestCase, get_fixture
  from django.conf import settings
  from django.test.utils import override_settings
  from catalogue.test_utils import WLTestCase, get_fixture
-import tempfile
  from catalogue.models import Book
  from catalogue.models import Book
-from search.index import Index, Search
  import catalogue
  import opds
  
  
  import catalogue
  import opds
  
  
-@override_settings(SEARCH_INDEX=tempfile.mkdtemp(prefix='djangotest_search_'))
-@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False),
-        'Requires search server and NO_SEARCH_INDEX=False.')
  class BookSearchTests(WLTestCase):
      def setUp(self):
          WLTestCase.setUp(self)
  
  class BookSearchTests(WLTestCase):
      def setUp(self):
          WLTestCase.setUp(self)
  
-        index = Index()
-        self.search = Search()
-        index.delete_query(self.search.index.query(uid="*"))
-        index.index.commit()
+        with override_settings(NO_SEARCH_INDEX=False):
+            self.do_doktora = Book.from_xml_file(
+                get_fixture('do-doktora.xml', opds))
+            self.do_anusie = Book.from_xml_file(
+                get_fixture('fraszka-do-anusie.xml', catalogue))
  
  
-        self.do_doktora = Book.from_xml_file(
-            get_fixture('do-doktora.xml', opds))
-        self.do_anusie = Book.from_xml_file(
-            get_fixture('fraszka-do-anusie.xml', catalogue))
-
-    # TODO: Add slop option to sunburnt
-    # def test_search_perfect_parts(self):
-    #     books = self.search.search_phrase("Jakoż hamować")
-    #     assert len(books) == 2
-    #     for b in books:
-    #         b.book_id == self.book.id
-    #     a = SearchResult.aggregate(books)
-    #     # just one fragment hit.
-    #     assert len(a[0].hits) == 1
+    def test_search_perfect_parts(self):
+        response = self.client.get('/szukaj/?q=Jakoż hamować')
+        res = response.context['results']
+        self.assertEqual(len(res['snippet']), 1)
+        for b, s in res['snippet'].items():
+             self.assertEqual(b.id, self.do_anusie.id)
diff --git a/src/search/urls.py b/src/search/urls.py

index 46e73c5..1e66d33 100644 (file)
--- a/src/search/urls.py
+++ b/src/search/urls.py
@@ -6,6 +6,6 @@ from . import views
  
  
  urlpatterns = [
  
  
  urlpatterns = [
-    path('', views.main, name='wlsearch'),
+    path('', views.search, name='wlsearch'),
      path('hint/', views.hint, name='search_hint'),
  ]
      path('hint/', views.hint, name='search_hint'),
  ]
diff --git a/src/search/utils.py b/src/search/utils.py

index 6c0acf5..77ff1ae 100644 (file)
--- a/src/search/utils.py
+++ b/src/search/utils.py
@@ -1,3 +1,4 @@
+from django.conf import settings
  from django.db.models import Func
  from django.contrib.postgres.search import SearchQuery, SearchVectorField
  
  from django.db.models import Func
  from django.contrib.postgres.search import SearchQuery, SearchVectorField
  
@@ -8,7 +9,8 @@ class UnaccentSearchQuery(SearchQuery):
      '''
      def as_sql(self, *args, **kwargs):
          sql, params = super().as_sql(*args, **kwargs)
      '''
      def as_sql(self, *args, **kwargs):
          sql, params = super().as_sql(*args, **kwargs)
-        sql = f'unaccent({sql}::text)::tsquery'
+        if settings.SEARCH_USE_UNACCENT:
+            sql = f'unaccent({sql}::text)::tsquery'
          return sql, params
  
  
          return sql, params
  
  
@@ -19,10 +21,11 @@ class UnaccentSearchVector(Func):
      But user enters 'roze' -> stem leaves it as is, so we need original form in the vector.
      '''
      function='to_tsvector'
      But user enters 'roze' -> stem leaves it as is, so we need original form in the vector.
      '''
      function='to_tsvector'
-    template = '''unaccent(
-      %(function)s('polish', %(expressions)s)::text)::tsvector ||
-     to_tsvector(
-       'polish_simple', 
-       unaccent(%(expressions)s)
-     )'''
+    if settings.SEARCH_USE_UNACCENT:
+        template = f'''unaccent(
+        %(function)s('{settings.SEARCH_CONFIG}', %(expressions)s)::text)::tsvector ||
+        to_tsvector(
+        '{settings.SEARCH_CONFIG_SIMPLE}', 
+        unaccent(%(expressions)s)
+        )'''
      output_field = SearchVectorField()
      output_field = SearchVectorField()
diff --git a/src/search/views.py b/src/search/views.py

index b5cc0ba..e5ea598 100644 (file)
--- a/src/search/views.py
+++ b/src/search/views.py
@@ -2,30 +2,18 @@
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  from django.conf import settings
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  from django.conf import settings
-from django.http.response import HttpResponseRedirect
  from django.shortcuts import render
  from django.views.decorators import cache
  from django.http import HttpResponse, JsonResponse
  
  from catalogue.models import Book, Tag
  from django.shortcuts import render
  from django.views.decorators import cache
  from django.http import HttpResponse, JsonResponse
  
  from catalogue.models import Book, Tag
-from pdcounter.models import Author
-from picture.models import Picture
-from search.index import Search, SearchResult, PictureResult
  from .forms import SearchFilters
  from .forms import SearchFilters
-from suggest.forms import PublishingSuggestForm
  import re
  import json
  
  from wolnelektury.utils import re_escape
  
  
  import re
  import json
  
  from wolnelektury.utils import re_escape
  
  
-def match_word_re(word):
-    if 'sqlite' in settings.DATABASES['default']['ENGINE']:
-        return r"\b%s\b" % word
-    elif 'mysql' in settings.DATABASES['default']['ENGINE']:
-        return "[[:<:]]%s[[:>:]]" % word
-
-
  query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  
  
  query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
  
  
@@ -33,32 +21,6 @@ def remove_query_syntax_chars(query, replace=' '):
      return query_syntax_chars.sub(replace, query)
  
  
      return query_syntax_chars.sub(replace, query)
  
  
-def did_you_mean(query, tokens):
-    return query
-    # change = {}
-    # for t in tokens:
-    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
-    #     if len(authors) > 0:
-    #         continue
-
-    #     if False:
-    #         if not dictionary.check(t):
-    #             try:
-    #                 change_to = dictionary.suggest(t)[0].lower()
-    #                 if change_to != t.lower():
-    #                     change[t] = change_to
-    #             except IndexError:
-    #                 pass
-
-    # if change == {}:
-    #     return None
-
-    # for frm, to in change.items():
-    #     query = query.replace(frm, to)
-
-    # return query
-
-
  @cache.never_cache
  def hint(request, mozhint=False, param='term'):
      prefix = request.GET.get(param, '')
  @cache.never_cache
  def hint(request, mozhint=False, param='term'):
      prefix = request.GET.get(param, '')
@@ -133,212 +95,3 @@ def search(request):
                  ctx['hasresults'] = True
                  break
      return render(request, 'search/results.html', ctx)
                  ctx['hasresults'] = True
                  break
      return render(request, 'search/results.html', ctx)
-
-
-@cache.never_cache
-def main(request):
-    if request.EXPERIMENTS['layout'].value:
-        return search(request)
-
-    query = request.GET.get('q', '')
-
-    format = request.GET.get('format')
-    lang = request.GET.get('lang')
-    epoch = request.GET.get('epoch')
-    kind = request.GET.get('kind')
-    genre = request.GET.get('genre')
-
-    if len(query) < 2:
-        return render(
-            request, 'catalogue/search_too_short.html',
-            {'prefix': query})
-    elif len(query) > 256:
-        return render(
-            request, 'catalogue/search_too_long.html',
-            {'prefix': query})
-
-    query = prepare_query(query)
-    if not (format or lang or epoch or kind or genre):
-        pd_authors = search_pd_authors(query)
-    else:
-        pd_authors = []
-    if not format or format != 'obraz':
-        books = search_books(
-            query,
-            lang=lang,
-            only_audio=format=='audio',
-            only_synchro=format=='synchro',
-            epoch=epoch,
-            kind=kind,
-            genre=genre
-        )
-    else:
-        books = []
-    if (not format or format == 'obraz') and not lang:
-        pictures = search_pictures(
-            query,
-            epoch=epoch,
-            kind=kind,
-            genre=genre
-        )
-    else:
-        pictures = []
-    
-    suggestion = ''
-
-    if not (books or pictures or pd_authors):
-        form = PublishingSuggestForm(initial={"books": query + ", "})
-        return render(
-            request,
-            'catalogue/search_no_hits.html',
-            {
-                'form': form,
-                'did_you_mean': suggestion
-            })
-
-    if not (books or pictures) and len(pd_authors) == 1:
-        return HttpResponseRedirect(pd_authors[0].get_absolute_url())
-
-    return render(
-        request,
-        'catalogue/search_multiple_hits.html',
-        {
-            'pd_authors': pd_authors,
-            'books': books,
-            'pictures': pictures,
-            'did_you_mean': suggestion,
-            'set': {
-                'lang': lang,
-                'format': format,
-                'epoch': epoch,
-                'kind': kind,
-                'genre': genre,
-            },
-            'tags': {
-                'epoch': Tag.objects.filter(category='epoch', for_books=True),
-                'genre': Tag.objects.filter(category='genre', for_books=True),
-                'kind': Tag.objects.filter(category='kind', for_books=True),
-            },
-        })
-
-def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
-    search = Search()
-    results_parts = []
-    search_fields = []
-    words = query.split()
-    fieldsets = (
-        (['authors', 'authors_nonstem'], True),
-        (['title', 'title_nonstem'], True),
-        (['metadata', 'metadata_nonstem'], True),
-        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
-    )
-    for fields, is_book in fieldsets:
-        search_fields += fields
-        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
-    results = []
-    ids_results = {}
-    for results_part in results_parts:
-        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
-            book_id = result.book_id
-            if book_id in ids_results:
-                ids_results[book_id].merge(result)
-            else:
-                results.append(result)
-                ids_results[book_id] = result
-    descendant_ids = set(
-        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
-    results = [result for result in results if result.book_id not in descendant_ids]
-    for result in results:
-        search.get_snippets(result, query, num=3)
-
-    def ensure_exists(r):
-        try:
-            if not r.book:
-                return False
-        except Book.DoesNotExist:
-            return False
-
-        if lang and r.book.language != lang:
-            return False
-        if only_audio and not r.book.has_mp3_file():
-            return False
-        if only_synchro and not r.book.has_daisy_file():
-            return False
-        if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
-            return False
-        if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
-            return False
-        if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
-            return False
-
-        return True
-
-    results = [r for r in results if ensure_exists(r)]
-    return results
-
-
-def search_pictures(query, epoch=None, kind=None, genre=None):
-    search = Search()
-    results_parts = []
-    search_fields = []
-    words = query.split()
-    fieldsets = (
-        (['authors', 'authors_nonstem'], True),
-        (['title', 'title_nonstem'], True),
-        (['metadata', 'metadata_nonstem'], True),
-        (['themes_pl', 'themes_pl_nonstem'], False),
-    )
-    for fields, is_book in fieldsets:
-        search_fields += fields
-        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
-    results = []
-    ids_results = {}
-    for results_part in results_parts:
-        for result in sorted(PictureResult.aggregate(results_part), reverse=True):
-            picture_id = result.picture_id
-            if picture_id in ids_results:
-                ids_results[picture_id].merge(result)
-            else:
-                results.append(result)
-                ids_results[picture_id] = result
-
-    def ensure_exists(r):
-        try:
-            if not r.picture:
-                return False
-        except Picture.DoesNotExist:
-            return False
-
-        if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
-            return False
-        if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
-            return False
-        if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
-            return False
-
-        return True
-
-    results = [r for r in results if ensure_exists(r)]
-    return results
-
-
-def search_pd_authors(query):
-    pd_authors = Author.objects.filter(name__icontains=query)
-    existing_slugs = Tag.objects.filter(
-        category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
-        .values_list('slug', flat=True)
-    pd_authors = pd_authors.exclude(slug__in=existing_slugs)
-    return pd_authors
-
-
-def prepare_query(query):
-    query = ' '.join(query.split())
-    # filter out private use characters
-    import unicodedata
-    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
-    query = remove_query_syntax_chars(query)
-
-    words = query.split()
-    if len(words) > 10:
-        query = ' '.join(words[:10])
-    return query
diff --git a/src/wolnelektury/settings/__init__.py b/src/wolnelektury/settings/__init__.py

index 113c1e7..f772d3d 100644 (file)
--- a/src/wolnelektury/settings/__init__.py
+++ b/src/wolnelektury/settings/__init__.py
@@ -29,15 +29,6 @@ except NameError:
      CELERY_TASK_ALWAYS_EAGER = True
  
  
      CELERY_TASK_ALWAYS_EAGER = True
  
  
-# If SEARCH_INDEX not configured, disable the search.
-try:
-    SOLR
-except NameError:
-    NO_SEARCH_INDEX = True
-else:
-    NO_SEARCH_INDEX = False
-
-
  try:
      SENTRY_DSN
  except NameError:
  try:
      SENTRY_DSN
  except NameError:
diff --git a/src/wolnelektury/settings/basic.py b/src/wolnelektury/settings/basic.py

index bbf684f..413adbe 100644 (file)
--- a/src/wolnelektury/settings/basic.py
+++ b/src/wolnelektury/settings/basic.py
@@ -28,9 +28,6 @@ DATABASES = {
  
  DEFAULT_AUTO_FIELD = 'django.db.models.AutoField'
  
  
  DEFAULT_AUTO_FIELD = 'django.db.models.AutoField'
  
-SOLR_TEST = "http://localhost:8983/solr/wl_test/"
-SOLR_STOPWORDS = "/path/to/solr/data/conf/lang/stopwords_pl.txt"
-
  # Local time zone for this installation. Choices can be found here:
  # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
  # although not all choices may be available on all operating systems.
  # Local time zone for this installation. Choices can be found here:
  # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
  # although not all choices may be available on all operating systems.
diff --git a/src/wolnelektury/settings/custom.py b/src/wolnelektury/settings/custom.py

index f7fca47..51a5613 100644 (file)
--- a/src/wolnelektury/settings/custom.py
+++ b/src/wolnelektury/settings/custom.py
@@ -68,6 +68,9 @@ CIVICRM_ACTIVITIES = {
  
  EXPERIMENTS_LAYOUT = 1
  EXPERIMENTS_SOWKA = 0
  
  EXPERIMENTS_LAYOUT = 1
  EXPERIMENTS_SOWKA = 0
-EXPERIMENTS_SEARCH = 0
  
  WIDGETS = {}
  
  WIDGETS = {}
+
+SEARCH_CONFIG = 'english'
+SEARCH_CONFIG_SIMPLE = 'simple'
+SEARCH_USE_UNACCENT = False
diff --git a/src/wolnelektury/settings/static.py b/src/wolnelektury/settings/static.py

index 179467d..97dcec4 100644 (file)
--- a/src/wolnelektury/settings/static.py
+++ b/src/wolnelektury/settings/static.py
@@ -8,7 +8,6 @@ from .paths import VAR_DIR
  # Example: "/home/media/media.lawrence.com/"
  MEDIA_ROOT = path.join(VAR_DIR, 'media/')
  STATIC_ROOT = path.join(VAR_DIR, 'static/')
  # Example: "/home/media/media.lawrence.com/"
  MEDIA_ROOT = path.join(VAR_DIR, 'media/')
  STATIC_ROOT = path.join(VAR_DIR, 'static/')
-SEARCH_INDEX = path.join(VAR_DIR, 'search_index/')
  
  # URL that handles the media served from MEDIA_ROOT. Make sure to use a
  # trailing slash if there is a path component (optional in other cases).
  
  # URL that handles the media served from MEDIA_ROOT. Make sure to use a
  # trailing slash if there is a path component (optional in other cases).
diff --git a/src/wolnelektury/settings/test.py b/src/wolnelektury/settings/test.py

index 57718b8..0e10be9 100644 (file)
--- a/src/wolnelektury/settings/test.py
+++ b/src/wolnelektury/settings/test.py
@@ -6,3 +6,7 @@ from wolnelektury.settings import *
  THUMBNAIL_BACKEND = 'wolnelektury.test_utils.DummyThumbnailBackend'
  CATALOGUE_GET_MP3_LENGTH = 'catalogue.test_utils.get_mp3_length'
  MEDIA_URL = '/media/'
  THUMBNAIL_BACKEND = 'wolnelektury.test_utils.DummyThumbnailBackend'
  CATALOGUE_GET_MP3_LENGTH = 'catalogue.test_utils.get_mp3_length'
  MEDIA_URL = '/media/'
+
+SEARCH_CONFIG = 'english'
+SEARCH_CONFIG_SIMPLE = 'simple'
+SEARCH_USE_UNACCENT = False
author	Radek Czajka <rczajka@rczajka.pl>
	Thu, 15 Jun 2023 10:48:46 +0000 (12:48 +0200)
committer	Radek Czajka <rczajka@rczajka.pl>
	Thu, 15 Jun 2023 10:48:46 +0000 (12:48 +0200)
.gitignore		patch \| blob \| history
requirements/requirements.txt		patch \| blob \| history
src/api/tests/res/responses/collection.json		patch \| blob \| history
src/catalogue/management/commands/importbooks.py		patch \| blob \| history
src/catalogue/models/book.py		patch \| blob \| history
src/catalogue/signals.py		patch \| blob \| history
src/catalogue/tasks.py		patch \| blob \| history
src/catalogue/templates/catalogue/search_multiple_hits.html	[deleted file]	patch \| blob \| history
src/catalogue/templates/catalogue/search_no_hits.html	[deleted file]	patch \| blob \| history
src/catalogue/templates/catalogue/search_too_long.html	[deleted file]	patch \| blob \| history
src/catalogue/templates/catalogue/search_too_short.html	[deleted file]	patch \| blob \| history
src/catalogue/test_utils.py		patch \| blob \| history
src/catalogue/tests/test_book_import.py		patch \| blob \| history
src/catalogue/tests/test_bookmedia.py		patch \| blob \| history
src/catalogue/tests/test_tags.py		patch \| blob \| history
src/club/forms.py		patch \| blob \| history
src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py		patch \| blob \| history
src/opds/tests/test_opds.py		patch \| blob \| history
src/opds/views.py		patch \| blob \| history
src/pdcounter/models.py		patch \| blob \| history
src/picture/models.py		patch \| blob \| history
src/picture/tasks.py		patch \| blob \| history
src/search/custom.py	[deleted file]	patch \| blob \| history
src/search/forms.py		patch \| blob \| history
src/search/index.py		patch \| blob \| history
src/search/management/__init__.py	[deleted file]	patch \| blob \| history
src/search/management/commands/__init__.py	[deleted file]	patch \| blob \| history
src/search/management/commands/reindex.py	[deleted file]	patch \| blob \| history
src/search/management/commands/reindex_pictures.py	[deleted file]	patch \| blob \| history
src/search/management/commands/snippets.py	[deleted file]	patch \| blob \| history
src/search/mock_search.py	[deleted file]	patch \| blob \| history
src/search/tests/index.py		patch \| blob \| history
src/search/urls.py		patch \| blob \| history
src/search/utils.py		patch \| blob \| history
src/search/views.py		patch \| blob \| history
src/wolnelektury/settings/__init__.py		patch \| blob \| history
src/wolnelektury/settings/basic.py		patch \| blob \| history
src/wolnelektury/settings/custom.py		patch \| blob \| history
src/wolnelektury/settings/static.py		patch \| blob \| history
src/wolnelektury/settings/test.py		patch \| blob \| history