Remove legacy search.
author: Radek Czajka <rczajka@rczajka.pl>
Thu, 15 Jun 2023 10:48:46 +0000 (12:48 +0200)
committer: Radek Czajka <rczajka@rczajka.pl>
Thu, 15 Jun 2023 10:48:46 +0000 (12:48 +0200)
40 files changed:
.gitignore
requirements/requirements.txt
src/api/tests/res/responses/collection.json
src/catalogue/management/commands/importbooks.py
src/catalogue/models/book.py
src/catalogue/signals.py
src/catalogue/tasks.py
src/catalogue/templates/catalogue/search_multiple_hits.html [deleted file]
src/catalogue/templates/catalogue/search_no_hits.html [deleted file]
src/catalogue/templates/catalogue/search_too_long.html [deleted file]
src/catalogue/templates/catalogue/search_too_short.html [deleted file]
src/catalogue/test_utils.py
src/catalogue/tests/test_book_import.py
src/catalogue/tests/test_bookmedia.py
src/catalogue/tests/test_tags.py
src/club/forms.py
src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py
src/opds/tests/test_opds.py
src/opds/views.py
src/pdcounter/models.py
src/picture/models.py
src/picture/tasks.py
src/search/custom.py [deleted file]
src/search/forms.py
src/search/index.py
src/search/management/__init__.py [deleted file]
src/search/management/commands/__init__.py [deleted file]
src/search/management/commands/reindex.py [deleted file]
src/search/management/commands/reindex_pictures.py [deleted file]
src/search/management/commands/snippets.py [deleted file]
src/search/mock_search.py [deleted file]
src/search/tests/index.py
src/search/urls.py
src/search/utils.py
src/search/views.py
src/wolnelektury/settings/__init__.py
src/wolnelektury/settings/basic.py
src/wolnelektury/settings/custom.py
src/wolnelektury/settings/static.py
src/wolnelektury/settings/test.py

index 99c0898..3d746f9 100644 (file)
@@ -21,6 +21,7 @@ coverage.xml
 pip-log.txt
 nosetests.xml
 /htmlcov
+.python-version
 
 # Mac OS X garbage
 .DS_Store
index 99e11fa..86afdfc 100644 (file)
@@ -49,9 +49,6 @@ celery[redis]==5.2.7
 #pyoai==2.5.1
 -e git+https://github.com/infrae/pyoai@5ff2f15e869869e70d8139e4c37b7832854d7049#egg=pyoai
 
-scorched==0.13
-httplib2
-
 sentry-sdk==0.10.2
 
 requests
index 2992461..0e385dc 100644 (file)
@@ -1,5 +1,6 @@
 {
     "url": "http://testserver/katalog/lektury/a-collection/",
+    "authors": [],
     "books": [
         {
             "kind": "Liryka", 
index e9b3364..9322eea 100644 (file)
@@ -12,7 +12,6 @@ from librarian.picture import ImageStore
 
 from catalogue.models import Book
 from picture.models import Picture
-from search.index import Index
 
 
 class Command(BaseCommand):
@@ -28,10 +27,6 @@ class Command(BaseCommand):
         parser.add_argument(
                 '-D', '--dont-build', dest='dont_build', metavar="FORMAT,...",
                 help="Skip building specified formats")
-        parser.add_argument(
-                '-S', '--no-search-index', action='store_false',
-                dest='search_index', default=True,
-                help='Skip indexing imported works for search')
         parser.add_argument(
                 '-F', '--not-findable', action='store_false',
                 dest='findable', default=True,
@@ -50,7 +45,6 @@ class Command(BaseCommand):
         file_base, ext = os.path.splitext(file_path)
         book = Book.from_xml_file(file_path, overwrite=options.get('force'),
                                   dont_build=dont_build,
-                                  search_index_tags=False,
                                   findable=options.get('findable'),
                                   remote_gallery_url='file://' + os.path.dirname(os.path.abspath(file_base)) + '/img/'
                                   )
@@ -84,15 +78,6 @@ class Command(BaseCommand):
         verbose = options.get('verbose')
         import_picture = options.get('import_picture')
 
-        if options.get('search_index') and not settings.NO_SEARCH_INDEX:
-            index = Index()
-            try:
-                index.index_tags()
-                index.index.commit()
-            except Exception as e:
-                index.index.rollback()
-                raise e
-
         files_imported = 0
         files_skipped = 0
 
index 85cfd63..bcbefea 100644 (file)
@@ -529,21 +529,11 @@ class Book(models.Model):
         })
         return create_zip(paths, "%s_%s" % (self.slug, format_), {'informacje.txt': readme})
 
-    def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
+    def search_index(self, index=None):
         if not self.findable:
             return
-        if index is None:
-            from search.index import Index
-            index = Index()
-        try:
-            index.index_book(self, book_info)
-            if index_tags:
-                index.index_tags()
-            if commit:
-                index.index.commit()
-        except Exception as e:
-            index.index.rollback()
-            raise e
+        from search.index import Index
+        Index.index_book(self)
 
     # will make problems in conjunction with paid previews
     def download_pictures(self, remote_gallery_url):
@@ -603,7 +593,7 @@ class Book(models.Model):
 
     @classmethod
     def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
-                           search_index_tags=True, remote_gallery_url=None, days=0, findable=True):
+                           remote_gallery_url=None, days=0, findable=True):
         from catalogue import tasks
 
         if dont_build is None:
@@ -712,7 +702,7 @@ class Book(models.Model):
                 getattr(book, '%s_file' % format_).build_delay()
 
         if not settings.NO_SEARCH_INDEX and search_index and findable:
-            tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags)
+            tasks.index_book.delay(book.id)
 
         for child in notify_cover_changed:
             child.parent_cover_changed()
index 72f8a89..81c0b9c 100644 (file)
@@ -53,13 +53,6 @@ def book_save(sender, instance, **kwargs):
 def book_delete(sender, instance, **kwargs):
     caches[settings.CACHE_MIDDLEWARE_ALIAS].clear()
 
-    if not settings.NO_SEARCH_INDEX:
-        # remove the book from search index, when it is deleted.
-        from search.index import Index
-        idx = Index()
-        idx.remove_book(instance)
-        idx.index_tags()
-
 
 ####
 # Tag
index b2308bb..0694b01 100644 (file)
@@ -32,9 +32,9 @@ def build_field(pk, field_name):
 
 
 @shared_task
-def index_book(book_id, book_info=None, **kwargs):
+def index_book(book_id, **kwargs):
     try:
-        return Book.objects.get(id=book_id).search_index(book_info, **kwargs)
+        return Book.objects.get(id=book_id).search_index(**kwargs)
     except Exception as e:
         print("Exception during index: %s" % e)
         print_exc()
diff --git a/src/catalogue/templates/catalogue/search_multiple_hits.html b/src/catalogue/templates/catalogue/search_multiple_hits.html
deleted file mode 100644 (file)
index 937b926..0000000
+++ /dev/null
@@ -1,130 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load pagination_tags %}
-{% load inline_tag_list from catalogue_tags %}
-{% load book_searched from search_tags %}
-{% load set_get_parameter %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-
-  <h1>{% trans "Search" %}</h1>
-
-  <div class="white-box">
-
-    <p class="search-filter">
-      <strong>format:</strong>
-      {% if not set.format %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter format='' %}">dowolny</a>{% endif %}
-      {% if set.format == "tekst" %}<em>tekst</em>{% else %}<a href="{% set_get_parameter format='tekst' %}">tekst</a>{% endif %}
-      {% if set.format == "audio" %}<em>audiobook</em>{% else %}<a href="{% set_get_parameter format='audio' %}">audiobook</a>{% endif %}
-      {% if set.format == "synchro" %}<em>DAISY</em>{% else %}<a href="{% set_get_parameter format='synchro' %}">DAISY</a>{% endif %}
-      {% if set.format == "obraz" %}<em>obraz</em>{% else %}<a href="{% set_get_parameter format='obraz' %}">obraz</a>{% endif %}
-    </p>
-
-    <p class="search-filter">
-      <strong>{% trans "language" %}: </strong>
-      {% if not set.lang %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter lang='' %}">dowolny</a>{% endif %}
-      {% if set.lang == "pol" %}<em>polski</em>{% else %}<a href="{% set_get_parameter lang='pol' %}">polski</a>{% endif %}
-      {% if set.lang == "eng" %}<em>angielski</em>{% else %}<a href="{% set_get_parameter lang='eng' %}">angielski</a>{% endif %}
-      {% if set.lang == "fre" %}<em>francuski</em>{% else %}<a href="{% set_get_parameter lang='fre' %}">francuski</a>{% endif %}
-      {% if set.lang == "ger" %}<em>niemiecki</em>{% else %}<a href="{% set_get_parameter lang='ger' %}">niemiecki</a>{% endif %}
-      {% if set.lang == "lit" %}<em>litewski</em>{% else %}<a href="{% set_get_parameter lang='lit' %}">litewski</a>{% endif %}
-    </p>
-
-    </p>
-    <p class="search-filter">
-      <strong>{% trans "epoch" %}: </strong>
-      {% if not set.epoch %}<em>dowolna</em>{% else %}<a href="{% set_get_parameter epoch='' %}">dowolna</a>{% endif %}
-
-      {% for tag in tags.epoch %}
-        {% if set.epoch == tag.slug %}
-          <em>{{ tag.name }}</em>
-        {% else %}
-          <a href="{% set_get_parameter epoch=tag.slug %}">
-            {{ tag.name }}
-          </a>
-        {% endif %}
-      {% endfor %}
-    </p>
-    <p class="search-filter">
-      <strong>{% trans "kind" %}: </strong>
-      {% if not set.kind %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter kind='' %}">dowolny</a>{% endif %}
-      {% for tag in tags.kind %}
-        {% if set.kind == tag.slug %}
-          <em>{{ tag.name }}</em>
-        {% else %}
-          <a href="{% set_get_parameter kind=tag.slug %}">
-            {{ tag.name }}
-          </a>
-        {% endif %}
-      {% endfor %}
-    </p>
-
-    {% comment %}
-    <p class="search-filter">
-      <strong>{% trans "genre" %}: </strong>
-      {% if not set.genre %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter genre='' %}">dowolny</a>{% endif %}
-      {% for tag in tags.genre %}
-          {% if set.genre == tag.slug %}
-            <em>{{ tag.name }}</em>
-          {% else %}
-            <a href="{% set_get_parameter genre=tag.slug %}">
-              {{ tag.name }}
-            </a>
-          {% endif %}
-        {% endfor %}
-    </p>
-    {% endcomment %}
-  </div>
-
-  {% if did_you_mean %}
-    <span class="did_you_mean">{% trans "Did you mean" %}
-      <a href="{% url 'search' %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
-  {% endif %}
-  <div class="top-tag-list">
-    {% if pd_authors %}
-      <div>
-        {% for author in pd_authors %}
-          <div class="tag-box">
-            {% include "pdcounter/pd_author_box.html" %}
-          </div>
-        {% endfor %}
-      </div>
-    {% endif %}
-  </div>
-
-  <div>
-    {% if books %}
-      <ul class="work-list">
-        {% if pictures %}
-          <h1>{% trans "Books" %}</h1>
-        {% endif %}
-        {% for result in books %}
-          <li class="Book-item">
-            <div class="search-result">
-              {% book_searched result %}
-            </div>
-          </li>
-        {% endfor %}
-      </ul>
-    {% endif %}
-
-    {% if pictures %}
-      <h1>{% trans "Art" %}</h1>
-      <ul class="work-list">
-        {% for result in pictures %}
-          <li class="Picture-item">
-            <div class="search-result">
-              {% with result.picture as picture %}
-                {% include "picture/picture_searched.html" %}
-              {% endwith %}
-            </div>
-          </li>
-        {% endfor %}
-      </ul>
-    {% endif %}
-  </div>
-{% endblock %}
diff --git a/src/catalogue/templates/catalogue/search_no_hits.html b/src/catalogue/templates/catalogue/search_no_hits.html
deleted file mode 100644 (file)
index 3f9e982..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-  <h1>{% trans "Search" %}</h1>
-
-  <div class="left-column">
-    <div class="normal-text">
-      <p>
-        {% if did_you_mean %}
-          <span class="did_you_mean">{% trans "Did you mean" %}
-            <a href="{% url 'search' %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
-        {% endif %}
-      </p>
-      <p>{% trans "Sorry! Search cirteria did not match any resources." %}</p>
-
-      {% include "info/join_us.html" %}
-    </div>
-  </div>
-
-  <div class="right-column">
-    {% include "publishing_suggest.html" %}
-  </div>
-{% endblock %}
diff --git a/src/catalogue/templates/catalogue/search_too_long.html b/src/catalogue/templates/catalogue/search_too_long.html
deleted file mode 100644 (file)
index 4f780df..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-  <h1>{% trans "Search" %}</h1>
-
-  <div id="books-list">
-    <p>{% trans "Sorry! Search query is too long to be processed." %}</p>
-    {% include "info/join_us.html" %}
-  </div>
-{% endblock %}
\ No newline at end of file
diff --git a/src/catalogue/templates/catalogue/search_too_short.html b/src/catalogue/templates/catalogue/search_too_short.html
deleted file mode 100644 (file)
index 253a94b..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-  <h1>{% trans "Search" %}</h1>
-
-  <div id="books-list">
-    <p>{% trans "Sorry! Search query must have at least two characters." %}</p>
-    {% include "info/join_us.html" %}
-  </div>
-{% endblock %}
\ No newline at end of file
index 6bc5569..c15cba7 100644 (file)
@@ -19,7 +19,6 @@ from librarian import WLURI
     CACHES={
         'default': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
     },
-    SOLR=settings.SOLR_TEST,
 )
 class WLTestCase(TestCase):
     """
@@ -74,7 +73,7 @@ def info_args(title, language=None):
         language = 'pol'
     return {
         'title': str(title),
-        'url': WLURI.from_slug(slug),
+        'url': WLURI(slug),
         'about': "http://wolnelektury.pl/example/URI/%s" % slug,
         'language': language,
     }
index f8900c8..5f9627f 100644 (file)
@@ -14,7 +14,7 @@ class BookImportLogicTests(WLTestCase):
     def setUp(self):
         WLTestCase.setUp(self)
         self.book_info = BookInfoStub(
-            url=WLURI.from_slug("default-book"),
+            url=WLURI("default-book"),
             about="http://wolnelektury.pl/example/URI/default_book",
             title="Default Book",
             author=PersonStub(("Jim",), "Lazy"),
@@ -121,7 +121,7 @@ class BookImportLogicTests(WLTestCase):
 
     def test_book_with_invalid_slug(self):
         """ Book with invalid characters in slug shouldn't be imported """
-        self.book_info.url = WLURI.from_slug("default_book")
+        self.book_info.url = WLURI("default_book")
         book_text = "<utwor />"
         with self.assertRaises(ValueError):
             models.Book.from_text_and_meta(ContentFile(book_text), self.book_info)
@@ -375,7 +375,7 @@ class TreeImportTest(WLTestCase):
 class MultilingualBookImportTest(WLTestCase):
     def setUp(self):
         WLTestCase.setUp(self)
-        common_uri = WLURI.from_slug('common-slug')
+        common_uri = WLURI('common-slug')
 
         self.pol_info = BookInfoStub(
             genre='X-Genre',
index edd17a5..7952826 100644 (file)
@@ -4,7 +4,7 @@
 from os.path import basename, exists
 from unittest import skip
 
-from django.core.files.base import ContentFile
+from django.core.files.base import ContentFile, File
 
 from catalogue.test_utils import *
 from catalogue import models, utils
@@ -17,6 +17,8 @@ class BookMediaTests(WLTestCase):
         self.file = ContentFile(b'X')
         self.file2 = ContentFile(b'Y')
         self.book = models.Book.objects.create(slug='test-book', title='Test')
+        with open(join(dirname(__file__), "files/fraszka-do-anusie.xml")) as f:
+            self.book.xml_file.save(None, File(f))
 
     def set_title(self, title):
         self.book.title = title
index a706618..0853a42 100644 (file)
@@ -107,73 +107,88 @@ class TagRelatedTagsTests(WLTestCase):
     def test_empty(self):
         """ empty tag should have no related tags """
 
-        cats = self.client.get('/katalog/autor/empty/').context['categories']
-        self.assertEqual({k: v for (k, v) in cats.items() if v}, {}, 'tags related to empty tag')
+        suggested = self.client.get('/katalog/autor/empty/').context['suggested_tags']
+        self.assertEqual(suggested, [], 'tags related to empty tag')
 
     def test_has_related(self):
         """ related own and descendants' tags should be generated """
 
-        cats = self.client.get('/katalog/rodzaj/kind/').context['categories']
-        self.assertTrue('Common Man' in [tag.name for tag in cats['author']],
+        suggested = {
+            (t.name, t.category)
+            for t in self.client.get('/katalog/rodzaj/kind/').context['suggested_tags']
+        }
+        self.assertTrue(('Common Man', 'author') in suggested,
                         'missing `author` related tag')
-        self.assertTrue('Epoch' in [tag.name for tag in cats['epoch']],
+        self.assertTrue(('Epoch', 'epoch') in suggested,
                         'missing `epoch` related tag')
-        self.assertFalse(cats.get("kind", False),
+        # TODO: this should probably be changed now.
+        self.assertFalse(any(x for x in suggested if x[1] == "kind"),
                          "There should be no child-only related `kind` tags")
-        self.assertTrue("Genre" in [tag.name for tag in cats['genre']],
+        self.assertTrue(("Genre", 'genre') in suggested,
                         'missing `genre` related tag')
-        self.assertFalse("ChildGenre" in [tag.name for tag in cats['genre']],
+        # TODO: this should probably be changed now.
+        self.assertFalse(("ChildGenre", 'genre') in suggested,
                          "There should be no child-only related `genre` tags")
-        self.assertTrue("GchildGenre" in [tag.name for tag in cats['genre']],
+        self.assertTrue(("GchildGenre", "genre") in suggested,
                         "missing grandchild's related tag")
-        self.assertTrue('Theme' in [tag.name for tag in cats['theme']],
+        self.assertTrue(('Theme', 'theme') in suggested,
                         "missing related theme")
-        self.assertFalse('Child1Theme' in [tag.name for tag in cats['theme']],
-                         "There should be no child-only related `theme` tags")
-        self.assertTrue('GChildTheme' in [tag.name for tag in cats['theme']],
+        self.assertTrue(('Child1Theme', 'theme') in suggested,
+                         "missing child's related theme")
+        self.assertTrue(('GChildTheme', 'theme') in suggested,
                         "missing grandchild's related theme")
 
     def test_related_differ(self):
         """ related tags shouldn't include filtering tags """
 
         response = self.client.get('/katalog/rodzaj/kind/')
-        cats = response.context['categories']
-        self.assertFalse(cats.get('kind', False),
+        suggested = response.context['suggested_tags']
+        self.assertFalse(any(x for x in suggested if x.category == 'kind'),
                          'filtering tag wrongly included in related')
-        cats = self.client.get('/katalog/motyw/theme/').context['categories']
-        self.assertFalse('Theme' in [tag.name for tag in cats['theme']],
+        suggested = {
+            (t.name, t.category)
+            for t in self.client.get(
+                    '/katalog/motyw/theme/').context['suggested_tags']
+        }
+        self.assertFalse(('Theme', 'theme') in suggested,
                          'filtering theme wrongly included in related')
 
     def test_parent_tag_once(self):
         """ if parent and descendants have a common tag, count it only once """
 
-        cats = self.client.get('/katalog/rodzaj/kind/').context['categories']
-        self.assertEqual([(tag.name, tag.count) for tag in cats['epoch']],
+        suggested = self.client.get('/katalog/rodzaj/kind/').context['suggested_tags']
+        self.assertEqual([(tag.name, tag.count) for tag in suggested if tag.category == 'epoch'],
                          [('Epoch', 1)],
                          'wrong related tag epoch tag on tag page')
 
     def test_siblings_tags_count(self):
         """ if children have tags and parent hasn't, count the children """
 
-        cats = self.client.get('/katalog/epoka/epoch/').context['categories']
+        suggested = self.client.get('/katalog/epoka/epoch/').context['suggested_tags']
+        kinds = [(tag.name, tag.count) for tag in suggested if tag.category == 'kind']
         self.assertTrue(
-            ('ChildKind', 2) in [(tag.name, tag.count) for tag in cats['kind']],
-            'wrong related kind tags on tag page, got: ' +
-            str([(tag.name, tag.count) for tag in cats['kind']]))
+            ('ChildKind', 2) in kinds,
+            'wrong related kind tags on tag page'
+        )
 
         # all occurencies of theme should be counted
-        self.assertTrue(('Theme', 4) in [(tag.name, tag.count) for tag in cats['theme']],
-                        'wrong related theme count')
+        themes = [(tag.name, tag.count) for tag in suggested if tag.category == 'theme']
+        self.assertTrue(
+            ('Theme', 4) in themes,
+            'wrong related theme count'
+        )
 
     def test_query_child_tag(self):
         """
         If child and parent have a common tag, but parent isn't included
         in the result, child should still count.
         """
-        cats = self.client.get('/katalog/gatunek/childgenre/').context['categories']
-        self.assertTrue(('Epoch', 2) in [(tag.name, tag.count) for tag in cats['epoch']],
-                        'wrong related kind tags on tag page, got: ' +
-                        str([(tag.name, tag.count) for tag in cats['epoch']]))
+        suggested = self.client.get('/katalog/gatunek/childgenre/').context['suggested_tags']
+        epochs = [(tag.name, tag.count) for tag in suggested if tag.category == 'epoch']
+        self.assertTrue(
+            ('Epoch', 2) in epochs,
+            'wrong related kind tags on tag page'
+        )
 
 
 class CleanTagRelationTests(WLTestCase):
@@ -198,8 +213,8 @@ class CleanTagRelationTests(WLTestCase):
         """ there should be no related tags left after deleting some objects """
 
         models.Book.objects.all().delete()
-        cats = self.client.get('/katalog/rodzaj/k/').context['categories']
-        self.assertEqual({k: v for (k, v) in cats.items() if v}, {})
+        suggested = self.client.get('/katalog/rodzaj/k/').context['suggested_tags']
+        self.assertEqual(suggested, [])
         self.assertEqual(models.Fragment.objects.all().count(), 0,
                          "orphaned fragments left")
         self.assertEqual(models.Tag.intermediary_table_model.objects.all().count(), 0,
@@ -248,10 +263,11 @@ class TestIdenticalTag(WLTestCase):
                 self.book_info)
         categories = {'author': 'autor', 'theme': 'motyw', 'epoch': 'epoka', 'kind': 'rodzaj', 'genre': 'gatunek'}
         for cat, localcat in categories.items():
+            if cat == 'theme': continue
             context = self.client.get('/katalog/%s/tag/' % localcat).context
             self.assertEqual(1, len(context['object_list']))
-            self.assertNotEqual({}, context['categories'])
-            self.assertFalse(context['categories'].get(cat, False))
+            self.assertNotEqual([], context['suggested_tags'])
+            self.assertFalse(any(t for t in context['suggested_tags'] if t.category == cat))
 
 
 class BookTagsTests(WLTestCase):
index 5c2d301..098a70e 100644 (file)
@@ -138,7 +138,8 @@ class DonationStep1Form(forms.ModelForm):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         club = models.Club.objects.first()
-        self.fields['custom_amount'].widget.attrs['min'] = club.min_amount
+        if club is not None:
+            self.fields['custom_amount'].widget.attrs['min'] = club.min_amount
 
     def clean(self):
         state = {}
index 0c696b2..c73450c 100644 (file)
@@ -6,8 +6,9 @@ from django.db import migrations, models
 def last_amount_wide(apps, schema_editor):
     SingleAmount = apps.get_model('club', 'SingleAmount')
     a = SingleAmount.objects.last()
-    a.wide = True
-    a.save()
+    if a is not None:
+        a.wide = True
+        a.save()
 
 
 class Migration(migrations.Migration):
index 2c37bd4..e86b865 100644 (file)
@@ -1,26 +1,20 @@
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from unittest import skipIf
 from lxml import etree
 from django.conf import settings
 import catalogue
 from catalogue.test_utils import WLTestCase, get_fixture
 from catalogue.models import Book
 from librarian import WLURI, XMLNamespace
-from search.index import Index
 
 AtomNS = XMLNamespace("http://www.w3.org/2005/Atom")
 
 
-@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False), 'Requires search server and NO_SEARCH_INDEX=False.')
 class OpdsSearchTests(WLTestCase):
     """Tests search feed in OPDS.."""
     def setUp(self):
         WLTestCase.setUp(self)
-        index = Index()
-        index.index.delete_all()
-        index.index.commit()
 
         self.do_doktora = Book.from_xml_file(
             get_fixture('do-doktora.xml'))
@@ -32,7 +26,7 @@ class OpdsSearchTests(WLTestCase):
         tree = etree.fromstring(
             self.client.get('/opds/search/?%s' % query).content)
         elem_ids = tree.findall('.//%s/%s' % (AtomNS('entry'), AtomNS('id')))
-        slugs = [WLURI(elem.text).slug for elem in elem_ids]
+        slugs = [WLURI.from_text(elem.text).slug for elem in elem_ids]
         self.assertEqual(set(slugs), set(b.slug for b in books), "OPDS search '%s' failed." % query)
 
     def test_opds_search_simple(self):
index 8e929c6..63c79a2 100644 (file)
@@ -16,8 +16,8 @@ from django.utils.functional import lazy
 
 from basicauth import logged_in_or_basicauth, factory_decorator
 from catalogue.models import Book, Tag
+from search.utils import UnaccentSearchQuery, UnaccentSearchVector
 
-from search.views import Search
 import operator
 import logging
 import re
@@ -350,15 +350,6 @@ class SearchFeed(AcquisitionFeed):
         'text': (10, 11),
         }
 
-    PARAMS_TO_FIELDS = {
-        'author': 'authors',
-        'translator': 'translators',
-        #        'title': 'title',
-        'categories': 'tag_name_pl',
-        'description': 'text',
-        #        'text': 'text',
-        }
-
     ATOM_PLACEHOLDER = re.compile(r"^{(atom|opds):\w+}$")
 
     def get_object(self, request):
@@ -413,30 +404,33 @@ class SearchFeed(AcquisitionFeed):
             # query is set above.
             log.debug("Inline query = [%s], criteria: %s" % (query, criteria))
 
-        srch = Search()
-
-        book_hit_filter = srch.index.Q(book_id__any=True)
-        filters = [book_hit_filter] + [srch.index.Q(
-            **{self.PARAMS_TO_FIELDS.get(cn, cn): criteria[cn]}
-            ) for cn in self.MATCHES.keys() if cn in criteria
-            if criteria[cn]]
-
+        books = Book.objects.filter(findable=True).annotate(
+            search_vector=UnaccentSearchVector('title')
+        )
         if query:
-            q = srch.index.query(
-                reduce(
-                    operator.or_,
-                    [srch.index.Q(**{self.PARAMS_TO_FIELDS.get(cn, cn): query}) for cn in self.MATCHES.keys()],
-                    srch.index.Q()))
-        else:
-            q = srch.index.query(srch.index.Q())
-
-        q = srch.apply_filters(q, filters).field_limit(score=True, fields=['book_id'])
-        results = q.execute()
-
-        book_scores = dict([(r['book_id'], r['score']) for r in results])
-        books = Book.objects.filter(findable=True, id__in=set([r['book_id'] for r in results]))
-        books = list(books)
-        books.sort(reverse=True, key=lambda book: book_scores[book.id])
+            squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
+            books = books.filter(search_vector=squery)
+        if criteria['author']:
+            authors = Tag.objects.filter(category='author').annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=UnaccentSearchQuery(criteria['author'], config=settings.SEARCH_CONFIG))
+            books = books.filter(tag_relations__tag__in=authors)
+        if criteria['categories']:
+            tags = Tag.objects.filter(category__in=('genre', 'kind', 'epoch')).annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=UnaccentSearchQuery(criteria['categories'], config=settings.SEARCH_CONFIG))
+            books = books.filter(tag_relations__tag__in=tags)
+        if criteria['translator']:
+            # TODO
+            pass
+        if criteria['title']:
+            books = books.filter(
+                search_vector=UnaccentSearchQuery(criteria['title'], config=settings.SEARCH_CONFIG)
+            )
+
+        books = books.exclude(ancestor__in=books)
+
+        books = books.order_by('popularity__count')
         return books
 
     def get_link(self, query):
index 2e1e0b9..5e94d5e 100644 (file)
@@ -110,15 +110,3 @@ class BookStub(models.Model):
 
     def pretty_title(self, html_links=False):
         return ', '.join((self.author, self.title))
-
-
-if not settings.NO_SEARCH_INDEX:
-    def update_index(sender, instance, **kwargs):
-        from search.index import Index
-        idx = Index()
-        idx.index_tags(instance, remove_only='created' not in kwargs)
-
-    post_delete.connect(update_index, Author)
-    post_delete.connect(update_index, BookStub)
-    post_save.connect(update_index, Author)
-    post_save.connect(update_index, BookStub)
index b9ddcae..2dadd0c 100644 (file)
@@ -180,7 +180,7 @@ class Picture(models.Model):
             return None
 
     @classmethod
-    def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False, search_index=True):
+    def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False):
         """
         Import xml and it's accompanying image file.
         If image file is missing, it will be fetched by librarian.picture.ImageStore
@@ -305,8 +305,6 @@ class Picture(models.Model):
             picture.xml_file.save("%s.xml" % picture.slug, File(xml_file))
             picture.save()
             tasks.generate_picture_html(picture.id)
-            if not settings.NO_SEARCH_INDEX and search_index:
-                tasks.index_picture.delay(picture.id, picture_info=picture_xml.picture_info)
 
         if close_xml_file:
             xml_file.close()
@@ -378,17 +376,3 @@ class Picture(models.Model):
     def clear_cache(self):
         clear_cached_renders(self.mini_box)
         clear_cached_renders(self.midi_box)
-
-    def search_index(self, picture_info=None, index=None, index_tags=True, commit=True):
-        if index is None:
-            from search.index import Index
-            index = Index()
-        try:
-            index.index_picture(self, picture_info)
-            if index_tags:
-                index.index_tags()
-            if commit:
-                index.index.commit()
-        except Exception as e:
-            index.index.rollback()
-            raise e
index ff9aa13..86b9829 100644 (file)
@@ -2,8 +2,6 @@
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 import json
-from traceback import print_exc
-
 from celery import shared_task
 from django.core.files.base import ContentFile
 from django.template.loader import render_to_string
@@ -20,14 +18,3 @@ def generate_picture_html(picture_id):
                 'themes': areas_json['themes'],
                 })
     pic.html_file.save("%s.html" % pic.slug, ContentFile(html_text))
-
-
-@shared_task
-def index_picture(picture_id, picture_info=None, **kwargs):
-    from picture.models import Picture
-    try:
-        return Picture.objects.get(id=picture_id).search_index(picture_info, **kwargs)
-    except Exception as e:
-        print("Exception during index: %s" % e)
-        print_exc()
-        raise e
diff --git a/src/search/custom.py b/src/search/custom.py
deleted file mode 100644 (file)
index 9337157..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import re
-from urllib.parse import urlencode
-import warnings
-from httplib2 import socket
-from lxml import etree
-from scorched import connection, exc, search
-
-
-class CustomSolrConnection(connection.SolrConnection):
-    def __init__(self, *args, **kw):
-        super(CustomSolrConnection, self).__init__(*args, **kw)
-        self.analysis_url = self.url + "analysis/field/"
-
-    def analyze(self, params):
-        qs = urlencode(params)
-        url = "%s?%s" % (self.analysis_url, qs)
-        if len(url) > self.max_length_get_url:
-            warnings.warn("Long query URL encountered - POSTing instead of GETting. "
-                          "This query will not be cached at the HTTP layer")
-            url = self.analysis_url
-            kwargs = dict(
-                method="POST",
-                data=qs,
-                headers={"Content-Type": "application/x-www-form-urlencoded"},
-            )
-        else:
-            kwargs = dict(method="GET")
-        response = self.request(url=url, **kwargs)
-        if response.status_code != 200:
-            raise exc.SolrError(response)
-        return response.content
-
-
-class CustomSolrInterface(connection.SolrInterface):
-    # just copied from parent and SolrConnection -> CustomSolrConnection
-    def __init__(self, url, http_connection=None, mode='',
-                 retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL,
-                 search_timeout=()):
-        """
-        :param url: url to Solr
-        :type url: str
-        :param http_connection: optional -- already existing connection
-        :type http_connection: requests connection
-        :param mode: optional -- mode (readable, writable) Solr
-        :type mode: str
-        :param retry_timeout: optional -- timeout until retry
-        :type retry_timeout: int
-        :param max_length_get_url: optional -- max length until switch to post
-        :type max_length_get_url: int
-        :param search_timeout: (optional) How long to wait for the server to
-                               send data before giving up, as a float, or a
-                               (connect timeout, read timeout) tuple.
-        :type search_timeout: float or tuple
-        """
-
-        self.conn = CustomSolrConnection(
-            url, http_connection, mode, retry_timeout, max_length_get_url)
-        self.schema = self.init_schema()
-        self._datefields = self._extract_datefields(self.schema)
-
-
-    def _analyze(self, **kwargs):
-        if not self.conn.readable:
-            raise TypeError("This Solr instance is only for writing")
-        args = {
-            'analysis_showmatch': True
-            }
-        if 'field' in kwargs:
-            args['analysis_fieldname'] = kwargs['field']
-        if 'text' in kwargs:
-            args['analysis_fieldvalue'] = kwargs['text']
-        if 'q' in kwargs:
-            args['q'] = kwargs['q']
-        if 'query' in kwargs:
-            args['q'] = kwargs['q']
-
-        params = [
-            (k.replace('_', '.'), v)
-            for (k, v) in search.params_from_dict(**args)
-        ]
-
-        content = self.conn.analyze(params)
-        doc = etree.fromstring(content)
-        return doc
-
-    def highlight(self, **kwargs):
-        doc = self._analyze(**kwargs)
-        analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
-        matches = set()
-        for wrd in analyzed:
-            start = int(wrd.xpath("int[@name='start']")[0].text)
-            end = int(wrd.xpath("int[@name='end']")[0].text)
-            matches.add((start, end))
-
-        if matches:
-            return self.substring(
-                kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("<b>", "</b>")))
-        else:
-            return None
-
-    def analyze(self, **kwargs):
-        doc = self._analyze(**kwargs)
-        terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
-        terms = map(lambda n: str(n.text), terms)
-        return terms
-
-    def expand_margins(self, text, start, end):
-        totlen = len(text)
-
-        def is_boundary(x):
-            ws = re.compile(r"\W", re.UNICODE)
-            return bool(ws.match(x))
-
-        while start > 0:
-            if is_boundary(text[start - 1]):
-                break
-            start -= 1
-
-        while end < totlen - 1:
-            if is_boundary(text[end + 1]):
-                break
-            end += 1
-
-        return start, end
-
-    def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
-        totlen = len(text)
-        matches_margins = [
-            ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches]
-
-        # lets start with first match
-        (start, end) = matches_margins[0][1]
-        new_matches = [matches_margins[0][0]]
-
-        for (m, (s, e)) in matches_margins[1:]:
-            if end < s or start > e:
-                continue
-            start = min(start, s)
-            end = max(end, e)
-            new_matches.append(m)
-
-        snip = text[start:end]
-        new_matches.sort(key=lambda a: -a[0])
-
-        for (s, e) in new_matches:
-            off = -start
-            snip = snip[:e + off] + mark[1] + snip[e + off:]
-            snip = snip[:s + off] + mark[0] + snip[s + off:]
-        snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip)
-
-        return snip
index 176c73e..3f6c99b 100644 (file)
@@ -158,8 +158,8 @@ class SearchFilters(forms.Form):
     def results(self):
         qs = self.get_querysets()
         query = self.cleaned_data['q']
-        squery = UnaccentSearchQuery(query, config='polish')
-        query = SearchQuery(query, config='polish')
+        squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
+        query = SearchQuery(query, config=settings.SEARCH_CONFIG)
         books = qs['book'].annotate(
             search_vector=UnaccentSearchVector('title')
         ).filter(search_vector=squery)
@@ -169,7 +169,7 @@ class SearchFilters(forms.Form):
                     headline=SearchHeadline(
                         'text',
                         query,
-                        config='polish',
+                        config=settings.SEARCH_CONFIG,
                         start_sel='<strong>',
                         stop_sel='</strong>',
                     )
index 4606f57..fc9e9d5 100644 (file)
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from functools import reduce, total_ordering
-from itertools import chain
-import logging
-import operator
-import os
 import re
-from django.conf import settings
-from librarian import dcparser
-import librarian.meta.types.person
-import librarian.meta.types.text
 from librarian.parser import WLDocument
 from lxml import etree
-import scorched
-import catalogue.models
-import picture.models
-from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
-from wolnelektury.utils import makedirs
-from . import custom
 
-log = logging.getLogger('search')
 
-
-if os.path.isfile(settings.SOLR_STOPWORDS):
-    stopwords = set(
-        line.strip()
-        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
-else:
-    stopwords = set()
-
-
-class SolrIndex(object):
-    def __init__(self, mode=None):
-        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
-
-
-class Snippets(object):
-    """
-    This class manages snippet files for indexed object (book)
-    the snippets are concatenated together, and their positions and
-    lengths are kept in lucene index fields.
-    """
-    SNIPPET_DIR = "snippets"
-
-    def __init__(self, book_id, revision=None):
-        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
-        self.book_id = book_id
-        self.revision = revision
-        self.file = None
-        self.position = None
-
-    @property
-    def path(self):
-        if self.revision:
-            fn = "%d.%d" % (self.book_id, self.revision)
-        else:
-            fn = "%d" % self.book_id
-
-        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
-
-    def open(self, mode='r'):
-        """
-        Open the snippet file. Call .close() afterwards.
-        """
-        if 'b' not in mode:
-            mode += 'b'
-
-        if 'w' in mode:
-            if os.path.exists(self.path):
-                self.revision = 1
-                while True:
-                    if not os.path.exists(self.path):
-                        break
-                    self.revision += 1
-
-        self.file = open(self.path, mode)
-        self.position = 0
-        return self
-
-    def add(self, snippet):
-        """
-        Append a snippet (unicode) to the snippet file.
-        Return a (position, length) tuple
-        """
-        txt = snippet.encode('utf-8')
-        l = len(txt)
-        self.file.write(txt)
-        pos = (self.position, l)
-        self.position += l
-        return pos
-
-    def get(self, pos):
-        """
-        Given a tuple of (position, length) return an unicode
-        of the snippet stored there.
-        """
-        self.file.seek(pos[0], 0)
-        try:
-            txt = self.file.read(pos[1]).decode('utf-8')
-        except:
-            return ''
-        return txt
-
-    def close(self):
-        """Close snippet file"""
-        if self.file:
-            self.file.close()
-
-    def remove(self):
-        self.revision = None
-        try:
-            os.unlink(self.path)
-            self.revision = 0
-            while True:
-                self.revision += 1
-                os.unlink(self.path)
-        except OSError:
-            pass
-
-
-class Index(SolrIndex):
+class Index:
     """
     Class indexing books.
     """
-    def __init__(self):
-        super(Index, self).__init__(mode='rw')
-
-    def remove_snippets(self, book):
-        book.snippet_set.all().delete()
-
-    def add_snippet(self, book, doc):
-        assert book.id == doc.pop('book_id')
-        # Fragments already exist and can be indexed where they live.
-        if 'fragment_anchor' in doc:
-            return
-
-        text = doc.pop('text')
-        header_index = doc.pop('header_index')
-        book.snippet_set.create(
-            sec=header_index,
-            text=text,
-        )
-
-    def delete_query(self, *queries):
-        """
-        index.delete(queries=...) doesn't work, so let's reimplement it
-        using deletion of list of uids.
-        """
-        uids = set()
-        for q in queries:
-            if isinstance(q, scorched.search.LuceneQuery):
-                q = self.index.query(q)
-            q.field_limiter.update(['uid'])
-            st = 0
-            rows = 100
-            while True:
-                ids = q.paginate(start=st, rows=rows).execute()
-                if not len(ids):
-                    break
-                for res in ids:
-                    uids.add(res['uid'])
-                st += rows
-        if uids:
-            # FIXME: With Solr API change, this doesn't work.
-            #self.index.delete(uids)
-            return True
-        else:
-            return False
-
-    def index_tags(self, *tags, **kw):
-        """
-        Re-index global tag list.
-        Removes all tags from index, then index them again.
-        Indexed fields include: id, name (with and without polish stems), category
-        """
-        log.debug("Indexing tags")
-        remove_only = kw.get('remove_only', False)
-        # first, remove tags from index.
-        if tags:
-            tag_qs = []
-            for tag in tags:
-                q_id = self.index.Q(tag_id=tag.id)
-
-                if isinstance(tag, PDCounterAuthor):
-                    q_cat = self.index.Q(tag_category='pd_author')
-                elif isinstance(tag, PDCounterBook):
-                    q_cat = self.index.Q(tag_category='pd_book')
-                else:
-                    q_cat = self.index.Q(tag_category=tag.category)
-
-                q_id_cat = self.index.Q(q_id & q_cat)
-                tag_qs.append(q_id_cat)
-            self.delete_query(*tag_qs)
-        else:  # all
-            q = self.index.Q(tag_id__any=True)
-            self.delete_query(q)
-
-        if not remove_only:
-            # then add them [all or just one passed]
-            if not tags:
-                tags = chain(
-                    catalogue.models.Tag.objects.exclude(category='set'),
-                    PDCounterAuthor.objects.all(),
-                    PDCounterBook.objects.all())
-
-            for tag in tags:
-                if isinstance(tag, PDCounterAuthor):
-                    doc = {
-                        "tag_id": int(tag.id),
-                        "tag_name": tag.name,
-                        "tag_name_pl": tag.name,
-                        "tag_category": 'pd_author',
-                        "is_pdcounter": True,
-                        "uid": "tag%d_pd_a" % tag.id
-                        }
-                elif isinstance(tag, PDCounterBook):
-                    doc = {
-                        "tag_id": int(tag.id),
-                        "tag_name": tag.title,
-                        "tag_name_pl": tag.title,
-                        "tag_category": 'pd_book',
-                        "is_pdcounter": True,
-                        "uid": "tag%d_pd_b" % tag.id
-                        }
-                else:
-                    doc = {
-                        "tag_id": int(tag.id),
-                        "tag_name": tag.name,
-                        "tag_name_pl": tag.name,
-                        "tag_category": tag.category,
-                        "is_pdcounter": False,
-                        "uid": "tag%d" % tag.id
-                        }
-                self.index.add(doc)
-
-    def create_book_doc(self, book):
-        """
-        Create a lucene document referring book id.
-        """
-        doc = {'book_id': int(book.id)}
-        if book.parent is not None:
-            doc['parent_id'] = int(book.parent.id)
-        return doc
-
-    def remove_book(self, book, remove_snippets=True, legacy=True):
-        """Removes a book from search index.
-        book - Book instance."""
-        if legacy:
-          self.delete_query(self.index.Q(book_id=book.id))
-
-          if remove_snippets:
-            snippets = Snippets(book.id)
-            snippets.remove()
-        self.remove_snippets(book)
-
-    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
-        """
-        Indexes the book.
-        Creates a lucene document for extracted metadata
-        and calls self.index_content() to index the contents of the book.
-        """
-        if not book.xml_file: return
-
-        if overwrite:
-            # we don't remove snippets, since they might be still needed by
-            # threads using not reopened index
-            self.remove_book(book, remove_snippets=False, legacy=legacy)
-
-        book_doc = self.create_book_doc(book)
-        meta_fields = self.extract_metadata(book, book_info, dc_only=[
-            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
-        # let's not index it - it's only used for extracting publish date
-        if 'source_name' in meta_fields:
-            del meta_fields['source_name']
-
-        for n, f in meta_fields.items():
-            book_doc[n] = f
-
-        book_doc['uid'] = "book%s" % book_doc['book_id']
-        if legacy:
-            self.index.add(book_doc)
-        del book_doc
-        book_fields = {
-            'title': meta_fields['title'],
-            'authors': meta_fields['authors'],
-            'published_date': meta_fields['published_date']
-            }
-
-        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
-            if tag_name in meta_fields:
-                book_fields[tag_name] = meta_fields[tag_name]
-
-        self.index_content(book, book_fields=book_fields, legacy=legacy)
-
     master_tags = [
         'opowiadanie',
         'powiesc',
@@ -307,7 +23,7 @@ class Index(SolrIndex):
         'uwaga', 'extra', 'nota_red', 'abstrakt',
         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
         'didaskalia',
-        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
+        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw'
     ]
 
     footnote_tags = ['pa', 'pt', 'pr', 'pe']
@@ -315,85 +31,41 @@ class Index(SolrIndex):
     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 
-    published_date_re = re.compile("([0-9]+)[\]. ]*$")
-
-    def extract_metadata(self, book, book_info=None, dc_only=None):
-        """
-        Extract metadata from book and returns a map of fields keyed by fieldname
-        """
-        fields = {}
-
-        if book_info is None:
-            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
-
-        fields['slug'] = book.slug
-        fields['is_book'] = True
-
-        # validator, name
-        for field in dcparser.BookInfo.FIELDS:
-            if dc_only and field.name not in dc_only:
-                continue
-            if hasattr(book_info, field.name):
-                if not getattr(book_info, field.name):
-                    continue
-                type_indicator = field.value_type
-                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
-                    s = getattr(book_info, field.name)
-                    if field.multiple:
-                        s = ', '.join(s)
-                    fields[field.name] = s
-                elif issubclass(type_indicator, librarian.meta.types.person.Person):
-                    p = getattr(book_info, field.name)
-                    if isinstance(p, librarian.meta.types.person.Person):
-                        persons = str(p)
-                    else:
-                        persons = ', '.join(map(str, p))
-                    fields[field.name] = persons
-
-        # get published date
-        pd = None
-        if hasattr(book_info, 'source_name') and book_info.source_name:
-            match = self.published_date_re.search(book_info.source_name)
-            if match is not None:
-                pd = str(match.groups()[0])
-        if not pd:
-            pd = ""
-        fields["published_date"] = pd
-
-        return fields
-
-    # def add_gaps(self, fields, fieldname):
-    #     """
-    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
-    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
-    #     """
-    #     def gap():
-    #         while True:
-    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
-    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
-
-    def get_master(self, root):
+    @classmethod
+    def get_master(cls, root):
         """
         Returns the first master tag from an etree.
         """
         for master in root.iter():
-            if master.tag in self.master_tags:
+            if master.tag in cls.master_tags:
                 return master
 
-    def index_content(self, book, book_fields, legacy=True):
+    @staticmethod
+    def add_snippet(book, text, position):
+        book.snippet_set.create(
+            sec=position + 1,
+            text=text
+        )
+
+    @classmethod
+    def index_book(cls, book):
         """
         Walks the book XML and extract content from it.
         Adds parts for each header tag and for each fragment.
         """
+        if not book.xml_file: return
+
+        book.snippet_set.all().delete()
+
         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
         root = wld.edoc.getroot()
 
-        master = self.get_master(root)
+        master = cls.get_master(root)
         if master is None:
             return []
 
         def walker(node):
-            if node.tag not in self.ignore_content_tags:
+            if node.tag not in cls.ignore_content_tags:
                 yield node, None, None
                 if node.text is not None:
                     yield None, node.text, None
@@ -407,627 +79,43 @@ class Index(SolrIndex):
             return
 
         def fix_format(text):
-            # separator = [" ", "\t", ".", ";", ","]
             if isinstance(text, list):
-                # need to join it first
                 text = filter(lambda s: s is not None, content)
                 text = ' '.join(text)
-                # for i in range(len(text)):
-                #     if i > 0:
-                #         if text[i][0] not in separator\
-                #             and text[i - 1][-1] not in separator:
-                #          text.insert(i, " ")
 
             return re.sub("(?m)/$", "", text)
 
-        def add_part(snippets, **fields):
-            doc = self.create_book_doc(book)
-            for n, v in book_fields.items():
-                doc[n] = v
-
-            doc['header_index'] = fields["header_index"]
-            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
-            doc['header_type'] = fields['header_type']
-
-            doc['text'] = fields['text']
-
-            # snippets
-            snip_pos = snippets.add(fields["text"])
-
-            doc['snippets_position'] = snip_pos[0]
-            doc['snippets_length'] = snip_pos[1]
-            if snippets.revision:
-                doc["snippets_revision"] = snippets.revision
-
-            if 'fragment_anchor' in fields:
-                doc["fragment_anchor"] = fields['fragment_anchor']
-
-            if 'themes' in fields:
-                doc['themes'] = fields['themes']
-            doc['uid'] = "part%s-%s-%s-%s" % (
-                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
-            return doc
-
-        fragments = {}
-        snippets = Snippets(book.id).open('w')
-        try:
-            for header, position in zip(list(master), range(len(master))):
-
-                if header.tag in self.skip_header_tags:
-                    continue
-                if header.tag is etree.Comment:
-                    continue
-
-                # section content
-                content = []
-                footnote = []
-
-                def all_content(text):
-                    for frag in fragments.values():
-                        frag['text'].append(text)
-                    content.append(text)
-                handle_text = [all_content]
-
-                for start, text, end in walker(header):
-                    # handle footnotes
-                    if start is not None and start.tag in self.footnote_tags:
-                        footnote = []
-
-                        def collect_footnote(t):
-                            footnote.append(t)
-
-                        handle_text.append(collect_footnote)
-                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
-                        handle_text.pop()
-                        doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       text=''.join(footnote))
-                        self.add_snippet(book, doc)
-                        if legacy:
-                            self.index.add(doc)
-                        footnote = []
-
-                    # handle fragments and themes.
-                    if start is not None and start.tag == 'begin':
-                        fid = start.attrib['id'][1:]
-                        fragments[fid] = {
-                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
-
-                    # themes for this fragment
-                    elif start is not None and start.tag == 'motyw':
-                        fid = start.attrib['id'][1:]
-                        handle_text.append(lambda text: None)
-                        if start.text is not None:
-                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
-                    elif end is not None and end.tag == 'motyw':
-                        handle_text.pop()
-
-                    elif start is not None and start.tag == 'end':
-                        fid = start.attrib['id'][1:]
-                        if fid not in fragments:
-                            continue  # a broken <end> node, skip it
-                        frag = fragments[fid]
-                        if not frag['themes']:
-                            continue  # empty themes list.
-                        del fragments[fid]
-
-                        doc = add_part(snippets,
-                                       header_type=frag['start_header'],
-                                       header_index=frag['start_section'],
-                                       header_span=position - frag['start_section'] + 1,
-                                       fragment_anchor=fid,
-                                       text=fix_format(frag['text']),
-                                       themes=frag['themes'])
-                        # Add searchable fragment
-                        self.add_snippet(book, doc)
-                        if legacy:
-                            self.index.add(doc)
-
-                        # Collect content.
-
-                    if text is not None and handle_text is not []:
-                        hdl = handle_text[-1]
-                        hdl(text)
-
-                        # in the end, add a section text.
-                doc = add_part(snippets, header_index=position,
-                               header_type=header.tag, text=fix_format(content))
-
-                self.add_snippet(book, doc)
-                if legacy:
-                    self.index.add(doc)
-
-        finally:
-            snippets.close()
-
-    def remove_picture(self, picture_or_id):
-        """Removes a picture from search index."""
-        if isinstance(picture_or_id, picture.models.Picture):
-            picture_id = picture_or_id.id
-        else:
-            picture_id = picture_or_id
-        self.delete_query(self.index.Q(picture_id=picture_id))
-
-    def index_picture(self, picture, picture_info=None, overwrite=True):
-        """
-        Indexes the picture.
-        Creates a lucene document for extracted metadata
-        and calls self.index_area() to index the contents of the picture.
-        """
-        if overwrite:
-            # we don't remove snippets, since they might be still needed by
-            # threads using not reopened index
-            self.remove_picture(picture)
-
-        picture_doc = {'picture_id': int(picture.id)}
-        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
-            'authors', 'title', 'epochs', 'kinds', 'genres'])
-
-        picture_doc.update(meta_fields)
-
-        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
-        self.index.add(picture_doc)
-        del picture_doc['is_book']
-        for area in picture.areas.all():
-            self.index_area(area, picture_fields=picture_doc)
-
-    def index_area(self, area, picture_fields):
-        """
-        Indexes themes and objects on the area.
-        """
-        doc = dict(picture_fields)
-        doc['area_id'] = area.id
-        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
-        doc['uid'] = 'area%s' % area.id
-        self.index.add(doc)
-
-
-@total_ordering
-class SearchResult(object):
-    def __init__(self, doc, how_found=None, query_terms=None):
-        self.boost = 1.0
-        self._hits = []
-        self._processed_hits = None  # processed hits
-        self.snippets = []
-        self.query_terms = query_terms
-        self._book = None
-
-        if 'score' in doc:
-            self._score = doc['score']
-        else:
-            self._score = 0
-
-        self.book_id = int(doc["book_id"])
-
-        try:
-            self.published_date = int(doc.get("published_date"))
-        except ValueError:
-            self.published_date = 0
-
-        # content hits
-        header_type = doc.get("header_type", None)
-        # we have a content hit in some header of fragment
-        if header_type is not None:
-            sec = (header_type, int(doc["header_index"]))
-            header_span = doc['header_span']
-            header_span = header_span is not None and int(header_span) or 1
-            fragment = doc.get("fragment_anchor", None)
-            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
-            snippets_rev = doc.get('snippets_revision', None)
-
-            hit = (sec + (header_span,), fragment, self._score, {
-                'how_found': how_found,
-                'snippets_pos': snippets_pos,
-                'snippets_revision': snippets_rev,
-                'themes': doc.get('themes', []),
-                'themes_pl': doc.get('themes_pl', [])
-                })
-
-            self._hits.append(hit)
-
-    @classmethod
-    def from_book(cls, book, how_found=None, query_terms=None):
-        doc = {
-            'score': book.popularity.count,
-            'book_id': book.id,
-            'published_date': 0,
-        }
-        result = cls(doc, how_found=how_found, query_terms=query_terms)
-        result._book = book
-        return result
-
-    def __str__(self):
-        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
-            (self.book_id, len(self._hits),
-             len(self._processed_hits) if self._processed_hits else -1,
-             self._score, len(self.snippets))
-
-    def __bytes__(self):
-        return str(self).encode('utf-8')
-
-    @property
-    def score(self):
-        return self._score * self.boost
-
-    def merge(self, other):
-        if self.book_id != other.book_id:
-            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
-        self._hits += other._hits
-        self._score += max(other._score, 0)
-        return self
-
-    def get_book(self):
-        if self._book is not None:
-            return self._book
-        try:
-            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
-        except catalogue.models.Book.DoesNotExist:
-            self._book = None
-        return self._book
-
-    book = property(get_book)
-
-    POSITION = 0
-    FRAGMENT = 1
-    POSITION_INDEX = 1
-    POSITION_SPAN = 2
-    SCORE = 2
-    OTHER = 3
-
-    @property
-    def hits(self):
-        if self._processed_hits is not None:
-            return self._processed_hits
-
-        # to sections and fragments
-        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
-
-        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
-
-        # sections not covered by fragments
-        sect = filter(lambda s: 0 == len(list(filter(
-            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
-                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
-
-        def remove_duplicates(lst, keyfn, larger):
-            els = {}
-            for e in lst:
-                eif = keyfn(e)
-                if eif in els:
-                    if larger(els[eif], e):
-                        continue
-                els[eif] = e
-            return els.values()
-
-        # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
-
-        # remove duplicate sections
-        sections = {}
-
-        for s in sect:
-            si = s[self.POSITION][self.POSITION_INDEX]
-            # skip existing
-            if si in sections:
-                if sections[si]['score'] >= s[self.SCORE]:
-                    continue
-
-            m = {'score': s[self.SCORE],
-                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
-                 }
-            m.update(s[self.OTHER])
-            sections[si] = m
-
-        hits = list(sections.values())
-
-        for f in frags:
-            try:
-                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
-            except catalogue.models.Fragment.DoesNotExist:
-                # stale index
+        for position, header in enumerate(master):
+            if header.tag in cls.skip_header_tags:
                 continue
-            # Figure out if we were searching for a token matching some word in theme name.
-            themes = frag.tags.filter(category='theme')
-            themes_hit = set()
-            if self.query_terms is not None:
-                for i in range(0, len(f[self.OTHER]['themes'])):
-                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(str.lower, tms)
-                    for qt in self.query_terms:
-                        if qt in tms:
-                            themes_hit.add(f[self.OTHER]['themes'][i])
-                            break
-
-            def theme_by_name(n):
-                th = list(filter(lambda t: t.name == n, themes))
-                if th:
-                    return th[0]
-                else:
-                    return None
-            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
-
-            m = {'score': f[self.SCORE],
-                 'fragment': frag,
-                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
-                 'themes': themes,
-                 'themes_hit': themes_hit
-                 }
-            m.update(f[self.OTHER])
-            hits.append(m)
-
-        hits.sort(key=lambda h: h['score'], reverse=True)
-
-        self._processed_hits = hits
-
-        return hits
-
-    @staticmethod
-    def aggregate(*result_lists):
-        books = {}
-        for rl in result_lists:
-            for r in rl:
-                if r.book_id in books:
-                    books[r.book_id].merge(r)
-                else:
-                    books[r.book_id] = r
-        return books.values()
-
-    def get_sort_key(self):
-        return (-self.score,
-                self.published_date,
-                self.book.sort_key_author if self.book else '',
-                self.book.sort_key if self.book else '')
-
-    def __lt__(self, other):
-        return self.get_sort_key() > other.get_sort_key()
-
-    def __eq__(self, other):
-        return self.get_sort_key() == other.get_sort_key()
-
-    def __len__(self):
-        return len(self.hits)
-
-    def snippet_pos(self, idx=0):
-        return self.hits[idx]['snippets_pos']
-
-    def snippet_revision(self, idx=0):
-        try:
-            return self.hits[idx]['snippets_revision']
-        except (IndexError, KeyError):
-            return None
-
-
-@total_ordering
-class PictureResult(object):
-    def __init__(self, doc, how_found=None, query_terms=None):
-        self.boost = 1.0
-        self.query_terms = query_terms
-        self._picture = None
-        self._hits = []
-        self._processed_hits = None
-
-        if 'score' in doc:
-            self._score = doc['score']
-        else:
-            self._score = 0
-
-        self.picture_id = int(doc["picture_id"])
-
-        if doc.get('area_id'):
-            hit = (self._score, {
-                'how_found': how_found,
-                'area_id': doc['area_id'],
-                'themes': doc.get('themes', []),
-                'themes_pl': doc.get('themes_pl', []),
-            })
-
-            self._hits.append(hit)
-
-    def __str__(self):
-        return "<PR id=%d score=%f >" % (self.picture_id, self._score)
-
-    def __repr__(self):
-        return str(self)
-
-    @property
-    def score(self):
-        return self._score * self.boost
-
-    def merge(self, other):
-        if self.picture_id != other.picture_id:
-            raise ValueError(
-                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
-        self._hits += other._hits
-        self._score += max(other._score, 0)
-        return self
-
-    SCORE = 0
-    OTHER = 1
-
-    @property
-    def hits(self):
-        if self._processed_hits is not None:
-            return self._processed_hits
-
-        hits = []
-        for hit in self._hits:
-            try:
-                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
-            except picture.models.PictureArea.DoesNotExist:
-                # stale index
+            if header.tag is etree.Comment:
                 continue
-            # Figure out if we were searching for a token matching some word in theme name.
-            themes_hit = set()
-            if self.query_terms is not None:
-                for i in range(0, len(hit[self.OTHER]['themes'])):
-                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
-                    tms = map(str.lower, tms)
-                    for qt in self.query_terms:
-                        if qt in tms:
-                            themes_hit.add(hit[self.OTHER]['themes'][i])
-                            break
 
-            m = {
-                'score': hit[self.SCORE],
-                'area': area,
-                'themes_hit': themes_hit,
-            }
-            m.update(hit[self.OTHER])
-            hits.append(m)
+            # section content
+            content = []
+            footnote = []
 
-        hits.sort(key=lambda h: h['score'], reverse=True)
-        hits = hits[:1]
-        self._processed_hits = hits
-        return hits
+            def all_content(text):
+                content.append(text)
+            handle_text = [all_content]
 
-    def get_picture(self):
-        if self._picture is None:
-            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
-        return self._picture
-
-    picture = property(get_picture)
-
-    @staticmethod
-    def aggregate(*result_lists):
-        books = {}
-        for rl in result_lists:
-            for r in rl:
-                if r.picture_id in books:
-                    books[r.picture_id].merge(r)
-                else:
-                    books[r.picture_id] = r
-        return books.values()
+            for start, text, end in walker(header):
+                # handle footnotes
+                if start is not None and start.tag in cls.footnote_tags:
+                    footnote = []
 
-    def __lt__(self, other):
-        return self.score < other.score
+                    def collect_footnote(t):
+                        footnote.append(t)
 
-    def __eq__(self, other):
-        return self.score == other.score
-
-
-class Search(SolrIndex):
-    """
-    Search facilities.
-    """
-    def __init__(self, default_field="text"):
-        super(Search, self).__init__(mode='r')
-
-    def make_term_query(self, query, field='text', modal=operator.or_):
-        """
-        Returns term queries joined by boolean query.
-        modal - applies to boolean query
-        fuzzy - should the query by fuzzy.
-        """
-        if query is None:
-            query = ''
-        q = self.index.Q()
-        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
-
-        return q
-
-    def search_by_author(self, words):
-        from catalogue.models import Book
-        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
-        for word in words:
-            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
-        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
-
-    def search_words(self, words, fields, required=None, book=True, picture=False):
-        if book and not picture and fields == ['authors']:
-            return self.search_by_author(words)
-        filters = []
-        for word in words:
-            if book or picture or (word not in stopwords):
-                word_filter = None
-                for field in fields:
-                    q = self.index.Q(**{field: word})
-                    if word_filter is None:
-                        word_filter = q
-                    else:
-                        word_filter |= q
-                filters.append(word_filter)
-        if required:
-            required_filter = None
-            for field in required:
-                for word in words:
-                    if book or picture or (word not in stopwords):
-                        q = self.index.Q(**{field: word})
-                        if required_filter is None:
-                            required_filter = q
-                        else:
-                            required_filter |= q
-            filters.append(required_filter)
-        if not filters:
-            return []
-        params = {}
-        if book:
-            params['is_book'] = True
-        if picture:
-            params['picture_id__gt'] = 0
-        else:
-            params['book_id__gt'] = 0
-        query = self.index.query(**params)
-        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
-        result_class = PictureResult if picture else SearchResult
-        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
-
-    def get_snippets(self, searchresult, query, field='text', num=1):
-        """
-        Returns a snippet for found scoreDoc.
-        """
-        maxnum = len(searchresult)
-        if num is None or num < 0 or num > maxnum:
-            num = maxnum
-        book_id = searchresult.book_id
-        revision = searchresult.snippet_revision()
-        snippets = Snippets(book_id, revision=revision)
-        snips = [None] * maxnum
-        try:
-            snippets.open()
-            idx = 0
-            while idx < maxnum and num > 0:
-                position, length = searchresult.snippet_pos(idx)
-                if position is None or length is None:
-                    continue
-                text = snippets.get((int(position),
-                                     int(length)))
-                snip = self.index.highlight(text=text, field=field, q=query)
-                if not snip and field == 'text':
-                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
-                if snip not in snips:
-                    snips[idx] = snip
-                    if snip:
-                        num -= 1
-                idx += 1
-
-        except IOError as e:
-            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
-            if not book:
-                log.error("Book does not exist for book id = %d" % book_id)
-            elif not book.get().children.exists():
-                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
-            return []
-        finally:
-            snippets.close()
-
-        # remove verse end markers..
-        snips = [s.replace("/\n", "\n") if s else s for s in snips]
-
-        searchresult.snippets = snips
-
-        return snips
-
-    @staticmethod
-    def apply_filters(query, filters):
-        """
-        Apply filters to a query
-        """
-        if filters is None:
-            filters = []
-        filters = filter(lambda x: x is not None, filters)
-        for f in filters:
-            query = query.query(f)
-        return query
+                    handle_text.append(collect_footnote)
+                elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
+                    handle_text.pop()
+                    cls.add_snippet(book, ''.join(footnote), position)
+                    footnote = []
 
+                if text is not None and handle_text is not []:
+                    hdl = handle_text[-1]
+                    hdl(text)
 
-if getattr(settings, 'SEARCH_MOCK', False):
-    from .mock_search import Search
+            # in the end, add a section text.
+            cls.add_snippet(book, fix_format(content), position)
diff --git a/src/search/management/__init__.py b/src/search/management/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/search/management/commands/__init__.py b/src/search/management/commands/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/src/search/management/commands/reindex.py b/src/search/management/commands/reindex.py
deleted file mode 100644 (file)
index c2fe78e..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import sys
-import traceback
-
-from django.core.management.base import BaseCommand
-
-
-def query_yes_no(question, default="yes"):
-    """Ask a yes/no question via raw_input() and return their answer.
-
-    "question" is a string that is presented to the user.
-    "default" is the presumed answer if the user just hits <Enter>.
-        It must be "yes" (the default), "no" or None (meaning
-        an answer is required of the user).
-
-    The "answer" return value is one of "yes" or "no".
-    """
-    valid = {"yes": True, "y": True, "ye": True,
-             "no": False, "n": False}
-    if default is None:
-        prompt = " [y/n] "
-    elif default == "yes":
-        prompt = " [Y/n] "
-    elif default == "no":
-        prompt = " [y/N] "
-    else:
-        raise ValueError("invalid default answer: '%s'" % default)
-
-    while True:
-        sys.stdout.write(question + prompt)
-        choice = raw_input().lower()
-        if default is not None and choice == '':
-            return valid[default]
-        elif choice in valid:
-            return valid[choice]
-        else:
-            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-
-
-class Command(BaseCommand):
-    help = 'Reindex everything.'
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-                '-n', '--book-id', action='store_true', dest='book_id',
-                default=False, help='book id instead of slugs')
-        parser.add_argument(
-                '-t', '--just-tags', action='store_true', dest='just_tags',
-                default=False, help='just reindex tags')
-        parser.add_argument(
-                '--start', dest='start_from', default=None,
-                help='start from this slug')
-        parser.add_argument(
-                '--stop', dest='stop_after', default=None,
-                help='stop after this slug')
-        parser.add_argument('args', nargs='*', metavar='slug/id')
-
-    def handle(self, **opts):
-        from catalogue.models import Book
-        from search.index import Index
-        idx = Index()
-        
-        if not opts['just_tags']:
-            if opts['args']:
-                books = []
-                for a in opts['args']:
-                    if opts['book_id']:
-                        books += Book.objects.filter(id=int(a)).all()
-                    else:
-                        books += Book.objects.filter(slug=a).all()
-            else:
-                books = list(Book.objects.order_by('slug'))
-            start_from = opts.get('start_from')
-            stop_after = opts.get('stop_after')
-            if start_from:
-                start_from = start_from.replace('-', '')
-            if stop_after:
-                stop_after = stop_after.replace('-', '')
-            while books:
-                try:
-                    b = books[0]
-                    slug = b.slug.replace('-', '')
-                    if stop_after and slug > stop_after:
-                        break
-                    if not start_from or slug >= start_from:
-                        print(b.slug)
-                        idx.index_book(b)
-                        idx.index.commit()
-                    books.pop(0)
-                except:
-                    traceback.print_exc()
-                    try:
-                        # we might not be able to rollback
-                        idx.index.rollback()
-                    except:
-                        pass
-                    retry = query_yes_no("Retry?")
-                    if not retry:
-                        break
-
-        print('Reindexing tags.')
-        idx.index_tags()
-        idx.index.commit()
diff --git a/src/search/management/commands/reindex_pictures.py b/src/search/management/commands/reindex_pictures.py
deleted file mode 100644 (file)
index 8505189..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import sys
-import traceback
-
-from django.core.management.base import BaseCommand
-
-
-def query_yes_no(question, default="yes"):
-    """Ask a yes/no question via raw_input() and return their answer.
-
-    "question" is a string that is presented to the user.
-    "default" is the presumed answer if the user just hits <Enter>.
-        It must be "yes" (the default), "no" or None (meaning
-        an answer is required of the user).
-
-    The "answer" return value is one of "yes" or "no".
-    """
-    valid = {"yes": True, "y": True, "ye": True,
-             "no": False, "n": False}
-    if default is None:
-        prompt = " [y/n] "
-    elif default == "yes":
-        prompt = " [Y/n] "
-    elif default == "no":
-        prompt = " [y/N] "
-    else:
-        raise ValueError("invalid default answer: '%s'" % default)
-
-    while True:
-        sys.stdout.write(question + prompt)
-        choice = raw_input().lower()
-        if default is not None and choice == '':
-            return valid[default]
-        elif choice in valid:
-            return valid[choice]
-        else:
-            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-
-
-class Command(BaseCommand):
-    help = 'Reindex pictures.'
-
-    def add_arguments(self, parser):
-        self.add_argument(
-                '-n', '--picture-id', action='store_true', dest='picture_id',
-                default=False, help='picture id instead of slugs')
-        self.add_argument('slug/id', nargs='*', metavar='slug/id')
-
-    def handle(self, **opts):
-        from picture.models import Picture
-        from search.index import Index
-        idx = Index()
-
-        if opts['args']:
-            pictures = []
-            for a in opts['args']:
-                if opts['picture_id']:
-                    pictures += Picture.objects.filter(id=int(a)).all()
-                else:
-                    pictures += Picture.objects.filter(slug=a).all()
-        else:
-            pictures = list(Picture.objects.order_by('slug'))
-        while pictures:
-            try:
-                p = pictures[0]
-                print(p.slug)
-                idx.index_picture(p)
-                idx.index.commit()
-                pictures.pop(0)
-            except:
-                traceback.print_exc()
-                try:
-                    # we might not be able to rollback
-                    idx.index.rollback()
-                except:
-                    pass
-                retry = query_yes_no("Retry?")
-                if not retry:
-                    break
diff --git a/src/search/management/commands/snippets.py b/src/search/management/commands/snippets.py
deleted file mode 100644 (file)
index 62512c9..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from glob import glob
-from os import path
-from django.conf import settings
-from django.core.management.base import BaseCommand
-
-
-class Command(BaseCommand):
-    help = 'Check snippets.'
-
-    def handle(self, *args, **opts):
-        sfn = glob(settings.SEARCH_INDEX+'snippets/*')
-        for fn in sfn:
-            print(fn)
-            bkid = path.basename(fn)
-            with open(fn) as f:
-                cont = f.read()
-                try:
-                    cont.decode('utf-8')
-                except UnicodeDecodeError:
-                    print("error in snippets %s" % bkid)
diff --git a/src/search/mock_search.py b/src/search/mock_search.py
deleted file mode 100644 (file)
index 33d2a5e..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from unittest.mock import Mock
-from catalogue.models import Book, Tag
-from random import randint, choice
-
-
-class Search(Mock):
-    """
-    Search mock for development without setting up Solr.
-
-    Instead of connecting to an actual search server, it returns
-    some random results for any query.
-    """
-    class MockIndex(Mock):
-        def analyze(*args, **kwargs):
-            return []
-
-    index = MockIndex()
-
-    def search_words(self, words, fields, required=None, book=True, picture=False):
-        from .index import SearchResult
-
-        max_results = 20
-        
-        if picture: return []
-
-        qs = Book.objects.filter(findable=True).order_by('?')
-        results = []
-        for book in qs[:randint(1, max_results)]:
-            doc = {
-                'score': randint(0, 100),
-                'book_id': book.pk,
-                'published_date': randint(1000, 1920),
-                }
-            res = SearchResult(doc, how_found='mock', query_terms=words)
-            results.append(res)
-        return results
-
index 34d9586..d63bafb 100644 (file)
@@ -1,40 +1,27 @@
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from unittest import skipIf
 from django.conf import settings
 from django.test.utils import override_settings
 from catalogue.test_utils import WLTestCase, get_fixture
-import tempfile
 from catalogue.models import Book
-from search.index import Index, Search
 import catalogue
 import opds
 
 
-@override_settings(SEARCH_INDEX=tempfile.mkdtemp(prefix='djangotest_search_'))
-@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False),
-        'Requires search server and NO_SEARCH_INDEX=False.')
 class BookSearchTests(WLTestCase):
     def setUp(self):
         WLTestCase.setUp(self)
 
-        index = Index()
-        self.search = Search()
-        index.delete_query(self.search.index.query(uid="*"))
-        index.index.commit()
+        with override_settings(NO_SEARCH_INDEX=False):
+            self.do_doktora = Book.from_xml_file(
+                get_fixture('do-doktora.xml', opds))
+            self.do_anusie = Book.from_xml_file(
+                get_fixture('fraszka-do-anusie.xml', catalogue))
 
-        self.do_doktora = Book.from_xml_file(
-            get_fixture('do-doktora.xml', opds))
-        self.do_anusie = Book.from_xml_file(
-            get_fixture('fraszka-do-anusie.xml', catalogue))
-
-    # TODO: Add slop option to sunburnt
-    # def test_search_perfect_parts(self):
-    #     books = self.search.search_phrase("Jakoż hamować")
-    #     assert len(books) == 2
-    #     for b in books:
-    #         b.book_id == self.book.id
-    #     a = SearchResult.aggregate(books)
-    #     # just one fragment hit.
-    #     assert len(a[0].hits) == 1
+    def test_search_perfect_parts(self):
+        response = self.client.get('/szukaj/?q=Jakoż hamować')
+        res = response.context['results']
+        self.assertEqual(len(res['snippet']), 1)
+        for b, s in res['snippet'].items():
+             self.assertEqual(b.id, self.do_anusie.id)
index 46e73c5..1e66d33 100644 (file)
@@ -6,6 +6,6 @@ from . import views
 
 
 urlpatterns = [
-    path('', views.main, name='wlsearch'),
+    path('', views.search, name='wlsearch'),
     path('hint/', views.hint, name='search_hint'),
 ]
index 6c0acf5..77ff1ae 100644 (file)
@@ -1,3 +1,4 @@
+from django.conf import settings
 from django.db.models import Func
 from django.contrib.postgres.search import SearchQuery, SearchVectorField
 
@@ -8,7 +9,8 @@ class UnaccentSearchQuery(SearchQuery):
     '''
     def as_sql(self, *args, **kwargs):
         sql, params = super().as_sql(*args, **kwargs)
-        sql = f'unaccent({sql}::text)::tsquery'
+        if settings.SEARCH_USE_UNACCENT:
+            sql = f'unaccent({sql}::text)::tsquery'
         return sql, params
 
 
@@ -19,10 +21,11 @@ class UnaccentSearchVector(Func):
     But user enters 'roze' -> stem leaves it as is, so we need original form in the vector.
     '''
     function='to_tsvector'
-    template = '''unaccent(
-      %(function)s('polish', %(expressions)s)::text)::tsvector ||
-     to_tsvector(
-       'polish_simple', 
-       unaccent(%(expressions)s)
-     )'''
+    if settings.SEARCH_USE_UNACCENT:
+        template = f'''unaccent(
+        %(function)s('{settings.SEARCH_CONFIG}', %(expressions)s)::text)::tsvector ||
+        to_tsvector(
+        '{settings.SEARCH_CONFIG_SIMPLE}', 
+        unaccent(%(expressions)s)
+        )'''
     output_field = SearchVectorField()
index b5cc0ba..e5ea598 100644 (file)
@@ -2,30 +2,18 @@
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 from django.conf import settings
-from django.http.response import HttpResponseRedirect
 from django.shortcuts import render
 from django.views.decorators import cache
 from django.http import HttpResponse, JsonResponse
 
 from catalogue.models import Book, Tag
-from pdcounter.models import Author
-from picture.models import Picture
-from search.index import Search, SearchResult, PictureResult
 from .forms import SearchFilters
-from suggest.forms import PublishingSuggestForm
 import re
 import json
 
 from wolnelektury.utils import re_escape
 
 
-def match_word_re(word):
-    if 'sqlite' in settings.DATABASES['default']['ENGINE']:
-        return r"\b%s\b" % word
-    elif 'mysql' in settings.DATABASES['default']['ENGINE']:
-        return "[[:<:]]%s[[:>:]]" % word
-
-
 query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
 
 
@@ -33,32 +21,6 @@ def remove_query_syntax_chars(query, replace=' '):
     return query_syntax_chars.sub(replace, query)
 
 
-def did_you_mean(query, tokens):
-    return query
-    # change = {}
-    # for t in tokens:
-    #     authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
-    #     if len(authors) > 0:
-    #         continue
-
-    #     if False:
-    #         if not dictionary.check(t):
-    #             try:
-    #                 change_to = dictionary.suggest(t)[0].lower()
-    #                 if change_to != t.lower():
-    #                     change[t] = change_to
-    #             except IndexError:
-    #                 pass
-
-    # if change == {}:
-    #     return None
-
-    # for frm, to in change.items():
-    #     query = query.replace(frm, to)
-
-    # return query
-
-
 @cache.never_cache
 def hint(request, mozhint=False, param='term'):
     prefix = request.GET.get(param, '')
@@ -133,212 +95,3 @@ def search(request):
                 ctx['hasresults'] = True
                 break
     return render(request, 'search/results.html', ctx)
-
-
-@cache.never_cache
-def main(request):
-    if request.EXPERIMENTS['layout'].value:
-        return search(request)
-
-    query = request.GET.get('q', '')
-
-    format = request.GET.get('format')
-    lang = request.GET.get('lang')
-    epoch = request.GET.get('epoch')
-    kind = request.GET.get('kind')
-    genre = request.GET.get('genre')
-
-    if len(query) < 2:
-        return render(
-            request, 'catalogue/search_too_short.html',
-            {'prefix': query})
-    elif len(query) > 256:
-        return render(
-            request, 'catalogue/search_too_long.html',
-            {'prefix': query})
-
-    query = prepare_query(query)
-    if not (format or lang or epoch or kind or genre):
-        pd_authors = search_pd_authors(query)
-    else:
-        pd_authors = []
-    if not format or format != 'obraz':
-        books = search_books(
-            query,
-            lang=lang,
-            only_audio=format=='audio',
-            only_synchro=format=='synchro',
-            epoch=epoch,
-            kind=kind,
-            genre=genre
-        )
-    else:
-        books = []
-    if (not format or format == 'obraz') and not lang:
-        pictures = search_pictures(
-            query,
-            epoch=epoch,
-            kind=kind,
-            genre=genre
-        )
-    else:
-        pictures = []
-    
-    suggestion = ''
-
-    if not (books or pictures or pd_authors):
-        form = PublishingSuggestForm(initial={"books": query + ", "})
-        return render(
-            request,
-            'catalogue/search_no_hits.html',
-            {
-                'form': form,
-                'did_you_mean': suggestion
-            })
-
-    if not (books or pictures) and len(pd_authors) == 1:
-        return HttpResponseRedirect(pd_authors[0].get_absolute_url())
-
-    return render(
-        request,
-        'catalogue/search_multiple_hits.html',
-        {
-            'pd_authors': pd_authors,
-            'books': books,
-            'pictures': pictures,
-            'did_you_mean': suggestion,
-            'set': {
-                'lang': lang,
-                'format': format,
-                'epoch': epoch,
-                'kind': kind,
-                'genre': genre,
-            },
-            'tags': {
-                'epoch': Tag.objects.filter(category='epoch', for_books=True),
-                'genre': Tag.objects.filter(category='genre', for_books=True),
-                'kind': Tag.objects.filter(category='kind', for_books=True),
-            },
-        })
-
-def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
-    search = Search()
-    results_parts = []
-    search_fields = []
-    words = query.split()
-    fieldsets = (
-        (['authors', 'authors_nonstem'], True),
-        (['title', 'title_nonstem'], True),
-        (['metadata', 'metadata_nonstem'], True),
-        (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
-    )
-    for fields, is_book in fieldsets:
-        search_fields += fields
-        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
-    results = []
-    ids_results = {}
-    for results_part in results_parts:
-        for result in sorted(SearchResult.aggregate(results_part), reverse=True):
-            book_id = result.book_id
-            if book_id in ids_results:
-                ids_results[book_id].merge(result)
-            else:
-                results.append(result)
-                ids_results[book_id] = result
-    descendant_ids = set(
-        Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
-    results = [result for result in results if result.book_id not in descendant_ids]
-    for result in results:
-        search.get_snippets(result, query, num=3)
-
-    def ensure_exists(r):
-        try:
-            if not r.book:
-                return False
-        except Book.DoesNotExist:
-            return False
-
-        if lang and r.book.language != lang:
-            return False
-        if only_audio and not r.book.has_mp3_file():
-            return False
-        if only_synchro and not r.book.has_daisy_file():
-            return False
-        if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
-            return False
-        if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
-            return False
-        if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
-            return False
-
-        return True
-
-    results = [r for r in results if ensure_exists(r)]
-    return results
-
-
-def search_pictures(query, epoch=None, kind=None, genre=None):
-    search = Search()
-    results_parts = []
-    search_fields = []
-    words = query.split()
-    fieldsets = (
-        (['authors', 'authors_nonstem'], True),
-        (['title', 'title_nonstem'], True),
-        (['metadata', 'metadata_nonstem'], True),
-        (['themes_pl', 'themes_pl_nonstem'], False),
-    )
-    for fields, is_book in fieldsets:
-        search_fields += fields
-        results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
-    results = []
-    ids_results = {}
-    for results_part in results_parts:
-        for result in sorted(PictureResult.aggregate(results_part), reverse=True):
-            picture_id = result.picture_id
-            if picture_id in ids_results:
-                ids_results[picture_id].merge(result)
-            else:
-                results.append(result)
-                ids_results[picture_id] = result
-
-    def ensure_exists(r):
-        try:
-            if not r.picture:
-                return False
-        except Picture.DoesNotExist:
-            return False
-
-        if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
-            return False
-        if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
-            return False
-        if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
-            return False
-
-        return True
-
-    results = [r for r in results if ensure_exists(r)]
-    return results
-
-
-def search_pd_authors(query):
-    pd_authors = Author.objects.filter(name__icontains=query)
-    existing_slugs = Tag.objects.filter(
-        category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
-        .values_list('slug', flat=True)
-    pd_authors = pd_authors.exclude(slug__in=existing_slugs)
-    return pd_authors
-
-
-def prepare_query(query):
-    query = ' '.join(query.split())
-    # filter out private use characters
-    import unicodedata
-    query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
-    query = remove_query_syntax_chars(query)
-
-    words = query.split()
-    if len(words) > 10:
-        query = ' '.join(words[:10])
-    return query
index 113c1e7..f772d3d 100644 (file)
@@ -29,15 +29,6 @@ except NameError:
     CELERY_TASK_ALWAYS_EAGER = True
 
 
-# If SEARCH_INDEX not configured, disable the search.
-try:
-    SOLR
-except NameError:
-    NO_SEARCH_INDEX = True
-else:
-    NO_SEARCH_INDEX = False
-
-
 try:
     SENTRY_DSN
 except NameError:
index bbf684f..413adbe 100644 (file)
@@ -28,9 +28,6 @@ DATABASES = {
 
 DEFAULT_AUTO_FIELD = 'django.db.models.AutoField'
 
-SOLR_TEST = "http://localhost:8983/solr/wl_test/"
-SOLR_STOPWORDS = "/path/to/solr/data/conf/lang/stopwords_pl.txt"
-
 # Local time zone for this installation. Choices can be found here:
 # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
 # although not all choices may be available on all operating systems.
index f7fca47..51a5613 100644 (file)
@@ -68,6 +68,9 @@ CIVICRM_ACTIVITIES = {
 
 EXPERIMENTS_LAYOUT = 1
 EXPERIMENTS_SOWKA = 0
-EXPERIMENTS_SEARCH = 0
 
 WIDGETS = {}
+
+SEARCH_CONFIG = 'english'
+SEARCH_CONFIG_SIMPLE = 'simple'
+SEARCH_USE_UNACCENT = False
index 179467d..97dcec4 100644 (file)
@@ -8,7 +8,6 @@ from .paths import VAR_DIR
 # Example: "/home/media/media.lawrence.com/"
 MEDIA_ROOT = path.join(VAR_DIR, 'media/')
 STATIC_ROOT = path.join(VAR_DIR, 'static/')
-SEARCH_INDEX = path.join(VAR_DIR, 'search_index/')
 
 # URL that handles the media served from MEDIA_ROOT. Make sure to use a
 # trailing slash if there is a path component (optional in other cases).
index 57718b8..0e10be9 100644 (file)
@@ -6,3 +6,7 @@ from wolnelektury.settings import *
 THUMBNAIL_BACKEND = 'wolnelektury.test_utils.DummyThumbnailBackend'
 CATALOGUE_GET_MP3_LENGTH = 'catalogue.test_utils.get_mp3_length'
 MEDIA_URL = '/media/'
+
+SEARCH_CONFIG = 'english'
+SEARCH_CONFIG_SIMPLE = 'simple'
+SEARCH_USE_UNACCENT = False