From 3596cf9db6eabb5f0aa36afe7919bc40e8ff0b9a Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Thu, 15 Jun 2023 12:48:46 +0200 Subject: [PATCH] Remove legacy search. --- .gitignore | 1 + requirements/requirements.txt | 3 - src/api/tests/res/responses/collection.json | 1 + .../management/commands/importbooks.py | 15 - src/catalogue/models/book.py | 20 +- src/catalogue/signals.py | 7 - src/catalogue/tasks.py | 4 +- .../catalogue/search_multiple_hits.html | 130 --- .../templates/catalogue/search_no_hits.html | 29 - .../templates/catalogue/search_too_long.html | 16 - .../templates/catalogue/search_too_short.html | 16 - src/catalogue/test_utils.py | 3 +- src/catalogue/tests/test_book_import.py | 6 +- src/catalogue/tests/test_bookmedia.py | 4 +- src/catalogue/tests/test_tags.py | 82 +- src/club/forms.py | 3 +- ...43_monthlyamount_wide_singleamount_wide.py | 5 +- src/opds/tests/test_opds.py | 8 +- src/opds/views.py | 60 +- src/pdcounter/models.py | 12 - src/picture/models.py | 18 +- src/picture/tasks.py | 13 - src/search/custom.py | 154 --- src/search/forms.py | 6 +- src/search/index.py | 1002 +---------------- src/search/management/__init__.py | 0 src/search/management/commands/__init__.py | 0 src/search/management/commands/reindex.py | 105 -- .../management/commands/reindex_pictures.py | 81 -- src/search/management/commands/snippets.py | 23 - src/search/mock_search.py | 40 - src/search/tests/index.py | 35 +- src/search/urls.py | 2 +- src/search/utils.py | 17 +- src/search/views.py | 247 ---- src/wolnelektury/settings/__init__.py | 9 - src/wolnelektury/settings/basic.py | 3 - src/wolnelektury/settings/custom.py | 5 +- src/wolnelektury/settings/static.py | 1 - src/wolnelektury/settings/test.py | 4 + 40 files changed, 177 insertions(+), 2013 deletions(-) delete mode 100644 src/catalogue/templates/catalogue/search_multiple_hits.html delete mode 100644 src/catalogue/templates/catalogue/search_no_hits.html delete mode 100644 src/catalogue/templates/catalogue/search_too_long.html delete mode 100644 src/catalogue/templates/catalogue/search_too_short.html delete mode 100644 src/search/custom.py delete mode 100644 src/search/management/__init__.py delete mode 100644 src/search/management/commands/__init__.py delete mode 100644 src/search/management/commands/reindex.py delete mode 100644 src/search/management/commands/reindex_pictures.py delete mode 100644 src/search/management/commands/snippets.py delete mode 100644 src/search/mock_search.py diff --git a/.gitignore b/.gitignore index 99c089870..3d746f904 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ coverage.xml pip-log.txt nosetests.xml /htmlcov +.python-version # Mac OS X garbage .DS_Store diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 99e11fa09..86afdfcec 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -49,9 +49,6 @@ celery[redis]==5.2.7 #pyoai==2.5.1 -e git+https://github.com/infrae/pyoai@5ff2f15e869869e70d8139e4c37b7832854d7049#egg=pyoai -scorched==0.13 -httplib2 - sentry-sdk==0.10.2 requests diff --git a/src/api/tests/res/responses/collection.json b/src/api/tests/res/responses/collection.json index 299246112..0e385dcaf 100644 --- a/src/api/tests/res/responses/collection.json +++ b/src/api/tests/res/responses/collection.json @@ -1,5 +1,6 @@ { "url": "http://testserver/katalog/lektury/a-collection/", + "authors": [], "books": [ { "kind": "Liryka", diff --git a/src/catalogue/management/commands/importbooks.py b/src/catalogue/management/commands/importbooks.py index e9b33642c..9322eea0b 100644 --- a/src/catalogue/management/commands/importbooks.py +++ b/src/catalogue/management/commands/importbooks.py @@ -12,7 +12,6 @@ from librarian.picture import ImageStore from catalogue.models import Book from picture.models import Picture -from search.index import Index class Command(BaseCommand): @@ -28,10 +27,6 @@ class Command(BaseCommand): parser.add_argument( '-D', '--dont-build', dest='dont_build', metavar="FORMAT,...", help="Skip building specified formats") - parser.add_argument( - '-S', '--no-search-index', action='store_false', - dest='search_index', default=True, - help='Skip indexing imported works for search') parser.add_argument( '-F', '--not-findable', action='store_false', dest='findable', default=True, @@ -50,7 +45,6 @@ class Command(BaseCommand): file_base, ext = os.path.splitext(file_path) book = Book.from_xml_file(file_path, overwrite=options.get('force'), dont_build=dont_build, - search_index_tags=False, findable=options.get('findable'), remote_gallery_url='file://' + os.path.dirname(os.path.abspath(file_base)) + '/img/' ) @@ -84,15 +78,6 @@ class Command(BaseCommand): verbose = options.get('verbose') import_picture = options.get('import_picture') - if options.get('search_index') and not settings.NO_SEARCH_INDEX: - index = Index() - try: - index.index_tags() - index.index.commit() - except Exception as e: - index.index.rollback() - raise e - files_imported = 0 files_skipped = 0 diff --git a/src/catalogue/models/book.py b/src/catalogue/models/book.py index 85cfd63b9..bcbefeaa1 100644 --- a/src/catalogue/models/book.py +++ b/src/catalogue/models/book.py @@ -529,21 +529,11 @@ class Book(models.Model): }) return create_zip(paths, "%s_%s" % (self.slug, format_), {'informacje.txt': readme}) - def search_index(self, book_info=None, index=None, index_tags=True, commit=True): + def search_index(self, index=None): if not self.findable: return - if index is None: - from search.index import Index - index = Index() - try: - index.index_book(self, book_info) - if index_tags: - index.index_tags() - if commit: - index.index.commit() - except Exception as e: - index.index.rollback() - raise e + from search.index import Index + Index.index_book(self) # will make problems in conjunction with paid previews def download_pictures(self, remote_gallery_url): @@ -603,7 +593,7 @@ class Book(models.Model): @classmethod def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True, - search_index_tags=True, remote_gallery_url=None, days=0, findable=True): + remote_gallery_url=None, days=0, findable=True): from catalogue import tasks if dont_build is None: @@ -712,7 +702,7 @@ class Book(models.Model): getattr(book, '%s_file' % format_).build_delay() if not settings.NO_SEARCH_INDEX and search_index and findable: - tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags) + tasks.index_book.delay(book.id) for child in notify_cover_changed: child.parent_cover_changed() diff --git a/src/catalogue/signals.py b/src/catalogue/signals.py index 72f8a8910..81c0b9ce6 100644 --- a/src/catalogue/signals.py +++ b/src/catalogue/signals.py @@ -53,13 +53,6 @@ def book_save(sender, instance, **kwargs): def book_delete(sender, instance, **kwargs): caches[settings.CACHE_MIDDLEWARE_ALIAS].clear() - if not settings.NO_SEARCH_INDEX: - # remove the book from search index, when it is deleted. - from search.index import Index - idx = Index() - idx.remove_book(instance) - idx.index_tags() - #### # Tag diff --git a/src/catalogue/tasks.py b/src/catalogue/tasks.py index b2308bbdf..0694b0174 100644 --- a/src/catalogue/tasks.py +++ b/src/catalogue/tasks.py @@ -32,9 +32,9 @@ def build_field(pk, field_name): @shared_task -def index_book(book_id, book_info=None, **kwargs): +def index_book(book_id, **kwargs): try: - return Book.objects.get(id=book_id).search_index(book_info, **kwargs) + return Book.objects.get(id=book_id).search_index(**kwargs) except Exception as e: print("Exception during index: %s" % e) print_exc() diff --git a/src/catalogue/templates/catalogue/search_multiple_hits.html b/src/catalogue/templates/catalogue/search_multiple_hits.html deleted file mode 100644 index 937b9266d..000000000 --- a/src/catalogue/templates/catalogue/search_multiple_hits.html +++ /dev/null @@ -1,130 +0,0 @@ -{% extends "base/base.html" %} -{% load i18n %} -{% load pagination_tags %} -{% load inline_tag_list from catalogue_tags %} -{% load book_searched from search_tags %} -{% load set_get_parameter %} - -{% block titleextra %}{% trans "Search" %}{% endblock %} - -{% block bodyid %}tagged-object-list{% endblock %} - -{% block body %} - -

{% trans "Search" %}

- -
- -

- format: - {% if not set.format %}dowolny{% else %}dowolny{% endif %} - {% if set.format == "tekst" %}tekst{% else %}tekst{% endif %} - {% if set.format == "audio" %}audiobook{% else %}audiobook{% endif %} - {% if set.format == "synchro" %}DAISY{% else %}DAISY{% endif %} - {% if set.format == "obraz" %}obraz{% else %}obraz{% endif %} -

- -

- {% trans "language" %}: - {% if not set.lang %}dowolny{% else %}dowolny{% endif %} - {% if set.lang == "pol" %}polski{% else %}polski{% endif %} - {% if set.lang == "eng" %}angielski{% else %}angielski{% endif %} - {% if set.lang == "fre" %}francuski{% else %}francuski{% endif %} - {% if set.lang == "ger" %}niemiecki{% else %}niemiecki{% endif %} - {% if set.lang == "lit" %}litewski{% else %}litewski{% endif %} -

- -

-

- {% trans "epoch" %}: - {% if not set.epoch %}dowolna{% else %}dowolna{% endif %} - - {% for tag in tags.epoch %} - {% if set.epoch == tag.slug %} - {{ tag.name }} - {% else %} - - {{ tag.name }} - - {% endif %} - {% endfor %} -

-

- {% trans "kind" %}: - {% if not set.kind %}dowolny{% else %}dowolny{% endif %} - {% for tag in tags.kind %} - {% if set.kind == tag.slug %} - {{ tag.name }} - {% else %} - - {{ tag.name }} - - {% endif %} - {% endfor %} -

- - {% comment %} -

- {% trans "genre" %}: - {% if not set.genre %}dowolny{% else %}dowolny{% endif %} - {% for tag in tags.genre %} - {% if set.genre == tag.slug %} - {{ tag.name }} - {% else %} - - {{ tag.name }} - - {% endif %} - {% endfor %} -

- {% endcomment %} -
- - {% if did_you_mean %} - {% trans "Did you mean" %} - {{did_you_mean|lower}}? - {% endif %} -
- {% if pd_authors %} -
- {% for author in pd_authors %} -
- {% include "pdcounter/pd_author_box.html" %} -
- {% endfor %} -
- {% endif %} -
- -
- {% if books %} - - {% endif %} - - {% if pictures %} -

{% trans "Art" %}

- - {% endif %} -
-{% endblock %} diff --git a/src/catalogue/templates/catalogue/search_no_hits.html b/src/catalogue/templates/catalogue/search_no_hits.html deleted file mode 100644 index 3f9e98203..000000000 --- a/src/catalogue/templates/catalogue/search_no_hits.html +++ /dev/null @@ -1,29 +0,0 @@ -{% extends "base/base.html" %} -{% load i18n %} -{% load catalogue_tags pagination_tags %} - -{% block titleextra %}{% trans "Search" %}{% endblock %} - -{% block bodyid %}tagged-object-list{% endblock %} - -{% block body %} -

{% trans "Search" %}

- -
-
-

- {% if did_you_mean %} - {% trans "Did you mean" %} - {{did_you_mean|lower}}? - {% endif %} -

-

{% trans "Sorry! Search cirteria did not match any resources." %}

- - {% include "info/join_us.html" %} -
-
- -
- {% include "publishing_suggest.html" %} -
-{% endblock %} diff --git a/src/catalogue/templates/catalogue/search_too_long.html b/src/catalogue/templates/catalogue/search_too_long.html deleted file mode 100644 index 4f780dfbe..000000000 --- a/src/catalogue/templates/catalogue/search_too_long.html +++ /dev/null @@ -1,16 +0,0 @@ -{% extends "base/base.html" %} -{% load i18n %} -{% load catalogue_tags pagination_tags %} - -{% block titleextra %}{% trans "Search" %}{% endblock %} - -{% block bodyid %}tagged-object-list{% endblock %} - -{% block body %} -

{% trans "Search" %}

- -
-

{% trans "Sorry! Search query is too long to be processed." %}

- {% include "info/join_us.html" %} -
-{% endblock %} \ No newline at end of file diff --git a/src/catalogue/templates/catalogue/search_too_short.html b/src/catalogue/templates/catalogue/search_too_short.html deleted file mode 100644 index 253a94b5c..000000000 --- a/src/catalogue/templates/catalogue/search_too_short.html +++ /dev/null @@ -1,16 +0,0 @@ -{% extends "base/base.html" %} -{% load i18n %} -{% load catalogue_tags pagination_tags %} - -{% block titleextra %}{% trans "Search" %}{% endblock %} - -{% block bodyid %}tagged-object-list{% endblock %} - -{% block body %} -

{% trans "Search" %}

- -
-

{% trans "Sorry! Search query must have at least two characters." %}

- {% include "info/join_us.html" %} -
-{% endblock %} \ No newline at end of file diff --git a/src/catalogue/test_utils.py b/src/catalogue/test_utils.py index 6bc5569c6..c15cba717 100644 --- a/src/catalogue/test_utils.py +++ b/src/catalogue/test_utils.py @@ -19,7 +19,6 @@ from librarian import WLURI CACHES={ 'default': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, }, - SOLR=settings.SOLR_TEST, ) class WLTestCase(TestCase): """ @@ -74,7 +73,7 @@ def info_args(title, language=None): language = 'pol' return { 'title': str(title), - 'url': WLURI.from_slug(slug), + 'url': WLURI(slug), 'about': "http://wolnelektury.pl/example/URI/%s" % slug, 'language': language, } diff --git a/src/catalogue/tests/test_book_import.py b/src/catalogue/tests/test_book_import.py index f8900c8a3..5f9627fc5 100644 --- a/src/catalogue/tests/test_book_import.py +++ b/src/catalogue/tests/test_book_import.py @@ -14,7 +14,7 @@ class BookImportLogicTests(WLTestCase): def setUp(self): WLTestCase.setUp(self) self.book_info = BookInfoStub( - url=WLURI.from_slug("default-book"), + url=WLURI("default-book"), about="http://wolnelektury.pl/example/URI/default_book", title="Default Book", author=PersonStub(("Jim",), "Lazy"), @@ -121,7 +121,7 @@ class BookImportLogicTests(WLTestCase): def test_book_with_invalid_slug(self): """ Book with invalid characters in slug shouldn't be imported """ - self.book_info.url = WLURI.from_slug("default_book") + self.book_info.url = WLURI("default_book") book_text = "" with self.assertRaises(ValueError): models.Book.from_text_and_meta(ContentFile(book_text), self.book_info) @@ -375,7 +375,7 @@ class TreeImportTest(WLTestCase): class MultilingualBookImportTest(WLTestCase): def setUp(self): WLTestCase.setUp(self) - common_uri = WLURI.from_slug('common-slug') + common_uri = WLURI('common-slug') self.pol_info = BookInfoStub( genre='X-Genre', diff --git a/src/catalogue/tests/test_bookmedia.py b/src/catalogue/tests/test_bookmedia.py index edd17a5d4..795282634 100644 --- a/src/catalogue/tests/test_bookmedia.py +++ b/src/catalogue/tests/test_bookmedia.py @@ -4,7 +4,7 @@ from os.path import basename, exists from unittest import skip -from django.core.files.base import ContentFile +from django.core.files.base import ContentFile, File from catalogue.test_utils import * from catalogue import models, utils @@ -17,6 +17,8 @@ class BookMediaTests(WLTestCase): self.file = ContentFile(b'X') self.file2 = ContentFile(b'Y') self.book = models.Book.objects.create(slug='test-book', title='Test') + with open(join(dirname(__file__), "files/fraszka-do-anusie.xml")) as f: + self.book.xml_file.save(None, File(f)) def set_title(self, title): self.book.title = title diff --git a/src/catalogue/tests/test_tags.py b/src/catalogue/tests/test_tags.py index a7066186c..0853a4212 100644 --- a/src/catalogue/tests/test_tags.py +++ b/src/catalogue/tests/test_tags.py @@ -107,73 +107,88 @@ class TagRelatedTagsTests(WLTestCase): def test_empty(self): """ empty tag should have no related tags """ - cats = self.client.get('/katalog/autor/empty/').context['categories'] - self.assertEqual({k: v for (k, v) in cats.items() if v}, {}, 'tags related to empty tag') + suggested = self.client.get('/katalog/autor/empty/').context['suggested_tags'] + self.assertEqual(suggested, [], 'tags related to empty tag') def test_has_related(self): """ related own and descendants' tags should be generated """ - cats = self.client.get('/katalog/rodzaj/kind/').context['categories'] - self.assertTrue('Common Man' in [tag.name for tag in cats['author']], + suggested = { + (t.name, t.category) + for t in self.client.get('/katalog/rodzaj/kind/').context['suggested_tags'] + } + self.assertTrue(('Common Man', 'author') in suggested, 'missing `author` related tag') - self.assertTrue('Epoch' in [tag.name for tag in cats['epoch']], + self.assertTrue(('Epoch', 'epoch') in suggested, 'missing `epoch` related tag') - self.assertFalse(cats.get("kind", False), + # TODO: this should probably be changed now. + self.assertFalse(any(x for x in suggested if x[1] == "kind"), "There should be no child-only related `kind` tags") - self.assertTrue("Genre" in [tag.name for tag in cats['genre']], + self.assertTrue(("Genre", 'genre') in suggested, 'missing `genre` related tag') - self.assertFalse("ChildGenre" in [tag.name for tag in cats['genre']], + # TODO: this should probably be changed now. + self.assertFalse(("ChildGenre", 'genre') in suggested, "There should be no child-only related `genre` tags") - self.assertTrue("GchildGenre" in [tag.name for tag in cats['genre']], + self.assertTrue(("GchildGenre", "genre") in suggested, "missing grandchild's related tag") - self.assertTrue('Theme' in [tag.name for tag in cats['theme']], + self.assertTrue(('Theme', 'theme') in suggested, "missing related theme") - self.assertFalse('Child1Theme' in [tag.name for tag in cats['theme']], - "There should be no child-only related `theme` tags") - self.assertTrue('GChildTheme' in [tag.name for tag in cats['theme']], + self.assertTrue(('Child1Theme', 'theme') in suggested, + "missing child's related theme") + self.assertTrue(('GChildTheme', 'theme') in suggested, "missing grandchild's related theme") def test_related_differ(self): """ related tags shouldn't include filtering tags """ response = self.client.get('/katalog/rodzaj/kind/') - cats = response.context['categories'] - self.assertFalse(cats.get('kind', False), + suggested = response.context['suggested_tags'] + self.assertFalse(any(x for x in suggested if x.category == 'kind'), 'filtering tag wrongly included in related') - cats = self.client.get('/katalog/motyw/theme/').context['categories'] - self.assertFalse('Theme' in [tag.name for tag in cats['theme']], + suggested = { + (t.name, t.category) + for t in self.client.get( + '/katalog/motyw/theme/').context['suggested_tags'] + } + self.assertFalse(('Theme', 'theme') in suggested, 'filtering theme wrongly included in related') def test_parent_tag_once(self): """ if parent and descendants have a common tag, count it only once """ - cats = self.client.get('/katalog/rodzaj/kind/').context['categories'] - self.assertEqual([(tag.name, tag.count) for tag in cats['epoch']], + suggested = self.client.get('/katalog/rodzaj/kind/').context['suggested_tags'] + self.assertEqual([(tag.name, tag.count) for tag in suggested if tag.category == 'epoch'], [('Epoch', 1)], 'wrong related tag epoch tag on tag page') def test_siblings_tags_count(self): """ if children have tags and parent hasn't, count the children """ - cats = self.client.get('/katalog/epoka/epoch/').context['categories'] + suggested = self.client.get('/katalog/epoka/epoch/').context['suggested_tags'] + kinds = [(tag.name, tag.count) for tag in suggested if tag.category == 'kind'] self.assertTrue( - ('ChildKind', 2) in [(tag.name, tag.count) for tag in cats['kind']], - 'wrong related kind tags on tag page, got: ' + - str([(tag.name, tag.count) for tag in cats['kind']])) + ('ChildKind', 2) in kinds, + 'wrong related kind tags on tag page' + ) # all occurencies of theme should be counted - self.assertTrue(('Theme', 4) in [(tag.name, tag.count) for tag in cats['theme']], - 'wrong related theme count') + themes = [(tag.name, tag.count) for tag in suggested if tag.category == 'theme'] + self.assertTrue( + ('Theme', 4) in themes, + 'wrong related theme count' + ) def test_query_child_tag(self): """ If child and parent have a common tag, but parent isn't included in the result, child should still count. """ - cats = self.client.get('/katalog/gatunek/childgenre/').context['categories'] - self.assertTrue(('Epoch', 2) in [(tag.name, tag.count) for tag in cats['epoch']], - 'wrong related kind tags on tag page, got: ' + - str([(tag.name, tag.count) for tag in cats['epoch']])) + suggested = self.client.get('/katalog/gatunek/childgenre/').context['suggested_tags'] + epochs = [(tag.name, tag.count) for tag in suggested if tag.category == 'epoch'] + self.assertTrue( + ('Epoch', 2) in epochs, + 'wrong related kind tags on tag page' + ) class CleanTagRelationTests(WLTestCase): @@ -198,8 +213,8 @@ class CleanTagRelationTests(WLTestCase): """ there should be no related tags left after deleting some objects """ models.Book.objects.all().delete() - cats = self.client.get('/katalog/rodzaj/k/').context['categories'] - self.assertEqual({k: v for (k, v) in cats.items() if v}, {}) + suggested = self.client.get('/katalog/rodzaj/k/').context['suggested_tags'] + self.assertEqual(suggested, []) self.assertEqual(models.Fragment.objects.all().count(), 0, "orphaned fragments left") self.assertEqual(models.Tag.intermediary_table_model.objects.all().count(), 0, @@ -248,10 +263,11 @@ class TestIdenticalTag(WLTestCase): self.book_info) categories = {'author': 'autor', 'theme': 'motyw', 'epoch': 'epoka', 'kind': 'rodzaj', 'genre': 'gatunek'} for cat, localcat in categories.items(): + if cat == 'theme': continue context = self.client.get('/katalog/%s/tag/' % localcat).context self.assertEqual(1, len(context['object_list'])) - self.assertNotEqual({}, context['categories']) - self.assertFalse(context['categories'].get(cat, False)) + self.assertNotEqual([], context['suggested_tags']) + self.assertFalse(any(t for t in context['suggested_tags'] if t.category == cat)) class BookTagsTests(WLTestCase): diff --git a/src/club/forms.py b/src/club/forms.py index 5c2d3018f..098a70edf 100644 --- a/src/club/forms.py +++ b/src/club/forms.py @@ -138,7 +138,8 @@ class DonationStep1Form(forms.ModelForm): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) club = models.Club.objects.first() - self.fields['custom_amount'].widget.attrs['min'] = club.min_amount + if club is not None: + self.fields['custom_amount'].widget.attrs['min'] = club.min_amount def clean(self): state = {} diff --git a/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py b/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py index 0c696b280..c73450c3a 100644 --- a/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py +++ b/src/club/migrations/0043_monthlyamount_wide_singleamount_wide.py @@ -6,8 +6,9 @@ from django.db import migrations, models def last_amount_wide(apps, schema_editor): SingleAmount = apps.get_model('club', 'SingleAmount') a = SingleAmount.objects.last() - a.wide = True - a.save() + if a is not None: + a.wide = True + a.save() class Migration(migrations.Migration): diff --git a/src/opds/tests/test_opds.py b/src/opds/tests/test_opds.py index 2c37bd46c..e86b86588 100644 --- a/src/opds/tests/test_opds.py +++ b/src/opds/tests/test_opds.py @@ -1,26 +1,20 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from unittest import skipIf from lxml import etree from django.conf import settings import catalogue from catalogue.test_utils import WLTestCase, get_fixture from catalogue.models import Book from librarian import WLURI, XMLNamespace -from search.index import Index AtomNS = XMLNamespace("http://www.w3.org/2005/Atom") -@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False), 'Requires search server and NO_SEARCH_INDEX=False.') class OpdsSearchTests(WLTestCase): """Tests search feed in OPDS..""" def setUp(self): WLTestCase.setUp(self) - index = Index() - index.index.delete_all() - index.index.commit() self.do_doktora = Book.from_xml_file( get_fixture('do-doktora.xml')) @@ -32,7 +26,7 @@ class OpdsSearchTests(WLTestCase): tree = etree.fromstring( self.client.get('/opds/search/?%s' % query).content) elem_ids = tree.findall('.//%s/%s' % (AtomNS('entry'), AtomNS('id'))) - slugs = [WLURI(elem.text).slug for elem in elem_ids] + slugs = [WLURI.from_text(elem.text).slug for elem in elem_ids] self.assertEqual(set(slugs), set(b.slug for b in books), "OPDS search '%s' failed." % query) def test_opds_search_simple(self): diff --git a/src/opds/views.py b/src/opds/views.py index 8e929c6bc..63c79a2f5 100644 --- a/src/opds/views.py +++ b/src/opds/views.py @@ -16,8 +16,8 @@ from django.utils.functional import lazy from basicauth import logged_in_or_basicauth, factory_decorator from catalogue.models import Book, Tag +from search.utils import UnaccentSearchQuery, UnaccentSearchVector -from search.views import Search import operator import logging import re @@ -350,15 +350,6 @@ class SearchFeed(AcquisitionFeed): 'text': (10, 11), } - PARAMS_TO_FIELDS = { - 'author': 'authors', - 'translator': 'translators', - # 'title': 'title', - 'categories': 'tag_name_pl', - 'description': 'text', - # 'text': 'text', - } - ATOM_PLACEHOLDER = re.compile(r"^{(atom|opds):\w+}$") def get_object(self, request): @@ -413,30 +404,33 @@ class SearchFeed(AcquisitionFeed): # query is set above. log.debug("Inline query = [%s], criteria: %s" % (query, criteria)) - srch = Search() - - book_hit_filter = srch.index.Q(book_id__any=True) - filters = [book_hit_filter] + [srch.index.Q( - **{self.PARAMS_TO_FIELDS.get(cn, cn): criteria[cn]} - ) for cn in self.MATCHES.keys() if cn in criteria - if criteria[cn]] - + books = Book.objects.filter(findable=True).annotate( + search_vector=UnaccentSearchVector('title') + ) if query: - q = srch.index.query( - reduce( - operator.or_, - [srch.index.Q(**{self.PARAMS_TO_FIELDS.get(cn, cn): query}) for cn in self.MATCHES.keys()], - srch.index.Q())) - else: - q = srch.index.query(srch.index.Q()) - - q = srch.apply_filters(q, filters).field_limit(score=True, fields=['book_id']) - results = q.execute() - - book_scores = dict([(r['book_id'], r['score']) for r in results]) - books = Book.objects.filter(findable=True, id__in=set([r['book_id'] for r in results])) - books = list(books) - books.sort(reverse=True, key=lambda book: book_scores[book.id]) + squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG) + books = books.filter(search_vector=squery) + if criteria['author']: + authors = Tag.objects.filter(category='author').annotate( + search_vector=UnaccentSearchVector('name_pl') + ).filter(search_vector=UnaccentSearchQuery(criteria['author'], config=settings.SEARCH_CONFIG)) + books = books.filter(tag_relations__tag__in=authors) + if criteria['categories']: + tags = Tag.objects.filter(category__in=('genre', 'kind', 'epoch')).annotate( + search_vector=UnaccentSearchVector('name_pl') + ).filter(search_vector=UnaccentSearchQuery(criteria['categories'], config=settings.SEARCH_CONFIG)) + books = books.filter(tag_relations__tag__in=tags) + if criteria['translator']: + # TODO + pass + if criteria['title']: + books = books.filter( + search_vector=UnaccentSearchQuery(criteria['title'], config=settings.SEARCH_CONFIG) + ) + + books = books.exclude(ancestor__in=books) + + books = books.order_by('popularity__count') return books def get_link(self, query): diff --git a/src/pdcounter/models.py b/src/pdcounter/models.py index 2e1e0b90e..5e94d5ec1 100644 --- a/src/pdcounter/models.py +++ b/src/pdcounter/models.py @@ -110,15 +110,3 @@ class BookStub(models.Model): def pretty_title(self, html_links=False): return ', '.join((self.author, self.title)) - - -if not settings.NO_SEARCH_INDEX: - def update_index(sender, instance, **kwargs): - from search.index import Index - idx = Index() - idx.index_tags(instance, remove_only='created' not in kwargs) - - post_delete.connect(update_index, Author) - post_delete.connect(update_index, BookStub) - post_save.connect(update_index, Author) - post_save.connect(update_index, BookStub) diff --git a/src/picture/models.py b/src/picture/models.py index b9ddcae9b..2dadd0c78 100644 --- a/src/picture/models.py +++ b/src/picture/models.py @@ -180,7 +180,7 @@ class Picture(models.Model): return None @classmethod - def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False, search_index=True): + def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False): """ Import xml and it's accompanying image file. If image file is missing, it will be fetched by librarian.picture.ImageStore @@ -305,8 +305,6 @@ class Picture(models.Model): picture.xml_file.save("%s.xml" % picture.slug, File(xml_file)) picture.save() tasks.generate_picture_html(picture.id) - if not settings.NO_SEARCH_INDEX and search_index: - tasks.index_picture.delay(picture.id, picture_info=picture_xml.picture_info) if close_xml_file: xml_file.close() @@ -378,17 +376,3 @@ class Picture(models.Model): def clear_cache(self): clear_cached_renders(self.mini_box) clear_cached_renders(self.midi_box) - - def search_index(self, picture_info=None, index=None, index_tags=True, commit=True): - if index is None: - from search.index import Index - index = Index() - try: - index.index_picture(self, picture_info) - if index_tags: - index.index_tags() - if commit: - index.index.commit() - except Exception as e: - index.index.rollback() - raise e diff --git a/src/picture/tasks.py b/src/picture/tasks.py index ff9aa1357..86b982932 100644 --- a/src/picture/tasks.py +++ b/src/picture/tasks.py @@ -2,8 +2,6 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # import json -from traceback import print_exc - from celery import shared_task from django.core.files.base import ContentFile from django.template.loader import render_to_string @@ -20,14 +18,3 @@ def generate_picture_html(picture_id): 'themes': areas_json['themes'], }) pic.html_file.save("%s.html" % pic.slug, ContentFile(html_text)) - - -@shared_task -def index_picture(picture_id, picture_info=None, **kwargs): - from picture.models import Picture - try: - return Picture.objects.get(id=picture_id).search_index(picture_info, **kwargs) - except Exception as e: - print("Exception during index: %s" % e) - print_exc() - raise e diff --git a/src/search/custom.py b/src/search/custom.py deleted file mode 100644 index 933715719..000000000 --- a/src/search/custom.py +++ /dev/null @@ -1,154 +0,0 @@ -# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -import re -from urllib.parse import urlencode -import warnings -from httplib2 import socket -from lxml import etree -from scorched import connection, exc, search - - -class CustomSolrConnection(connection.SolrConnection): - def __init__(self, *args, **kw): - super(CustomSolrConnection, self).__init__(*args, **kw) - self.analysis_url = self.url + "analysis/field/" - - def analyze(self, params): - qs = urlencode(params) - url = "%s?%s" % (self.analysis_url, qs) - if len(url) > self.max_length_get_url: - warnings.warn("Long query URL encountered - POSTing instead of GETting. " - "This query will not be cached at the HTTP layer") - url = self.analysis_url - kwargs = dict( - method="POST", - data=qs, - headers={"Content-Type": "application/x-www-form-urlencoded"}, - ) - else: - kwargs = dict(method="GET") - response = self.request(url=url, **kwargs) - if response.status_code != 200: - raise exc.SolrError(response) - return response.content - - -class CustomSolrInterface(connection.SolrInterface): - # just copied from parent and SolrConnection -> CustomSolrConnection - def __init__(self, url, http_connection=None, mode='', - retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL, - search_timeout=()): - """ - :param url: url to Solr - :type url: str - :param http_connection: optional -- already existing connection - :type http_connection: requests connection - :param mode: optional -- mode (readable, writable) Solr - :type mode: str - :param retry_timeout: optional -- timeout until retry - :type retry_timeout: int - :param max_length_get_url: optional -- max length until switch to post - :type max_length_get_url: int - :param search_timeout: (optional) How long to wait for the server to - send data before giving up, as a float, or a - (connect timeout, read timeout) tuple. - :type search_timeout: float or tuple - """ - - self.conn = CustomSolrConnection( - url, http_connection, mode, retry_timeout, max_length_get_url) - self.schema = self.init_schema() - self._datefields = self._extract_datefields(self.schema) - - - def _analyze(self, **kwargs): - if not self.conn.readable: - raise TypeError("This Solr instance is only for writing") - args = { - 'analysis_showmatch': True - } - if 'field' in kwargs: - args['analysis_fieldname'] = kwargs['field'] - if 'text' in kwargs: - args['analysis_fieldvalue'] = kwargs['text'] - if 'q' in kwargs: - args['q'] = kwargs['q'] - if 'query' in kwargs: - args['q'] = kwargs['q'] - - params = [ - (k.replace('_', '.'), v) - for (k, v) in search.params_from_dict(**args) - ] - - content = self.conn.analyze(params) - doc = etree.fromstring(content) - return doc - - def highlight(self, **kwargs): - doc = self._analyze(**kwargs) - analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']") - matches = set() - for wrd in analyzed: - start = int(wrd.xpath("int[@name='start']")[0].text) - end = int(wrd.xpath("int[@name='end']")[0].text) - matches.add((start, end)) - - if matches: - return self.substring( - kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("", ""))) - else: - return None - - def analyze(self, **kwargs): - doc = self._analyze(**kwargs) - terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]") - terms = map(lambda n: str(n.text), terms) - return terms - - def expand_margins(self, text, start, end): - totlen = len(text) - - def is_boundary(x): - ws = re.compile(r"\W", re.UNICODE) - return bool(ws.match(x)) - - while start > 0: - if is_boundary(text[start - 1]): - break - start -= 1 - - while end < totlen - 1: - if is_boundary(text[end + 1]): - break - end += 1 - - return start, end - - def substring(self, text, matches, margins=30, mark=("", "")): - totlen = len(text) - matches_margins = [ - ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches] - - # lets start with first match - (start, end) = matches_margins[0][1] - new_matches = [matches_margins[0][0]] - - for (m, (s, e)) in matches_margins[1:]: - if end < s or start > e: - continue - start = min(start, s) - end = max(end, e) - new_matches.append(m) - - snip = text[start:end] - new_matches.sort(key=lambda a: -a[0]) - - for (s, e) in new_matches: - off = -start - snip = snip[:e + off] + mark[1] + snip[e + off:] - snip = snip[:s + off] + mark[0] + snip[s + off:] - snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip) - - return snip diff --git a/src/search/forms.py b/src/search/forms.py index 176c73ee8..3f6c99b75 100644 --- a/src/search/forms.py +++ b/src/search/forms.py @@ -158,8 +158,8 @@ class SearchFilters(forms.Form): def results(self): qs = self.get_querysets() query = self.cleaned_data['q'] - squery = UnaccentSearchQuery(query, config='polish') - query = SearchQuery(query, config='polish') + squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG) + query = SearchQuery(query, config=settings.SEARCH_CONFIG) books = qs['book'].annotate( search_vector=UnaccentSearchVector('title') ).filter(search_vector=squery) @@ -169,7 +169,7 @@ class SearchFilters(forms.Form): headline=SearchHeadline( 'text', query, - config='polish', + config=settings.SEARCH_CONFIG, start_sel='', stop_sel='', ) diff --git a/src/search/index.py b/src/search/index.py index 4606f57db..fc9e9d54c 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -1,299 +1,15 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from functools import reduce, total_ordering -from itertools import chain -import logging -import operator -import os import re -from django.conf import settings -from librarian import dcparser -import librarian.meta.types.person -import librarian.meta.types.text from librarian.parser import WLDocument from lxml import etree -import scorched -import catalogue.models -import picture.models -from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook -from wolnelektury.utils import makedirs -from . import custom -log = logging.getLogger('search') - -if os.path.isfile(settings.SOLR_STOPWORDS): - stopwords = set( - line.strip() - for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#')) -else: - stopwords = set() - - -class SolrIndex(object): - def __init__(self, mode=None): - self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode) - - -class Snippets(object): - """ - This class manages snippet files for indexed object (book) - the snippets are concatenated together, and their positions and - lengths are kept in lucene index fields. - """ - SNIPPET_DIR = "snippets" - - def __init__(self, book_id, revision=None): - makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)) - self.book_id = book_id - self.revision = revision - self.file = None - self.position = None - - @property - def path(self): - if self.revision: - fn = "%d.%d" % (self.book_id, self.revision) - else: - fn = "%d" % self.book_id - - return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn) - - def open(self, mode='r'): - """ - Open the snippet file. Call .close() afterwards. - """ - if 'b' not in mode: - mode += 'b' - - if 'w' in mode: - if os.path.exists(self.path): - self.revision = 1 - while True: - if not os.path.exists(self.path): - break - self.revision += 1 - - self.file = open(self.path, mode) - self.position = 0 - return self - - def add(self, snippet): - """ - Append a snippet (unicode) to the snippet file. - Return a (position, length) tuple - """ - txt = snippet.encode('utf-8') - l = len(txt) - self.file.write(txt) - pos = (self.position, l) - self.position += l - return pos - - def get(self, pos): - """ - Given a tuple of (position, length) return an unicode - of the snippet stored there. - """ - self.file.seek(pos[0], 0) - try: - txt = self.file.read(pos[1]).decode('utf-8') - except: - return '' - return txt - - def close(self): - """Close snippet file""" - if self.file: - self.file.close() - - def remove(self): - self.revision = None - try: - os.unlink(self.path) - self.revision = 0 - while True: - self.revision += 1 - os.unlink(self.path) - except OSError: - pass - - -class Index(SolrIndex): +class Index: """ Class indexing books. """ - def __init__(self): - super(Index, self).__init__(mode='rw') - - def remove_snippets(self, book): - book.snippet_set.all().delete() - - def add_snippet(self, book, doc): - assert book.id == doc.pop('book_id') - # Fragments already exist and can be indexed where they live. - if 'fragment_anchor' in doc: - return - - text = doc.pop('text') - header_index = doc.pop('header_index') - book.snippet_set.create( - sec=header_index, - text=text, - ) - - def delete_query(self, *queries): - """ - index.delete(queries=...) doesn't work, so let's reimplement it - using deletion of list of uids. - """ - uids = set() - for q in queries: - if isinstance(q, scorched.search.LuceneQuery): - q = self.index.query(q) - q.field_limiter.update(['uid']) - st = 0 - rows = 100 - while True: - ids = q.paginate(start=st, rows=rows).execute() - if not len(ids): - break - for res in ids: - uids.add(res['uid']) - st += rows - if uids: - # FIXME: With Solr API change, this doesn't work. - #self.index.delete(uids) - return True - else: - return False - - def index_tags(self, *tags, **kw): - """ - Re-index global tag list. - Removes all tags from index, then index them again. - Indexed fields include: id, name (with and without polish stems), category - """ - log.debug("Indexing tags") - remove_only = kw.get('remove_only', False) - # first, remove tags from index. - if tags: - tag_qs = [] - for tag in tags: - q_id = self.index.Q(tag_id=tag.id) - - if isinstance(tag, PDCounterAuthor): - q_cat = self.index.Q(tag_category='pd_author') - elif isinstance(tag, PDCounterBook): - q_cat = self.index.Q(tag_category='pd_book') - else: - q_cat = self.index.Q(tag_category=tag.category) - - q_id_cat = self.index.Q(q_id & q_cat) - tag_qs.append(q_id_cat) - self.delete_query(*tag_qs) - else: # all - q = self.index.Q(tag_id__any=True) - self.delete_query(q) - - if not remove_only: - # then add them [all or just one passed] - if not tags: - tags = chain( - catalogue.models.Tag.objects.exclude(category='set'), - PDCounterAuthor.objects.all(), - PDCounterBook.objects.all()) - - for tag in tags: - if isinstance(tag, PDCounterAuthor): - doc = { - "tag_id": int(tag.id), - "tag_name": tag.name, - "tag_name_pl": tag.name, - "tag_category": 'pd_author', - "is_pdcounter": True, - "uid": "tag%d_pd_a" % tag.id - } - elif isinstance(tag, PDCounterBook): - doc = { - "tag_id": int(tag.id), - "tag_name": tag.title, - "tag_name_pl": tag.title, - "tag_category": 'pd_book', - "is_pdcounter": True, - "uid": "tag%d_pd_b" % tag.id - } - else: - doc = { - "tag_id": int(tag.id), - "tag_name": tag.name, - "tag_name_pl": tag.name, - "tag_category": tag.category, - "is_pdcounter": False, - "uid": "tag%d" % tag.id - } - self.index.add(doc) - - def create_book_doc(self, book): - """ - Create a lucene document referring book id. - """ - doc = {'book_id': int(book.id)} - if book.parent is not None: - doc['parent_id'] = int(book.parent.id) - return doc - - def remove_book(self, book, remove_snippets=True, legacy=True): - """Removes a book from search index. - book - Book instance.""" - if legacy: - self.delete_query(self.index.Q(book_id=book.id)) - - if remove_snippets: - snippets = Snippets(book.id) - snippets.remove() - self.remove_snippets(book) - - def index_book(self, book, book_info=None, overwrite=True, legacy=True): - """ - Indexes the book. - Creates a lucene document for extracted metadata - and calls self.index_content() to index the contents of the book. - """ - if not book.xml_file: return - - if overwrite: - # we don't remove snippets, since they might be still needed by - # threads using not reopened index - self.remove_book(book, remove_snippets=False, legacy=legacy) - - book_doc = self.create_book_doc(book) - meta_fields = self.extract_metadata(book, book_info, dc_only=[ - 'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres']) - # let's not index it - it's only used for extracting publish date - if 'source_name' in meta_fields: - del meta_fields['source_name'] - - for n, f in meta_fields.items(): - book_doc[n] = f - - book_doc['uid'] = "book%s" % book_doc['book_id'] - if legacy: - self.index.add(book_doc) - del book_doc - book_fields = { - 'title': meta_fields['title'], - 'authors': meta_fields['authors'], - 'published_date': meta_fields['published_date'] - } - - for tag_name in ('translators', 'epochs', 'kinds', 'genres'): - if tag_name in meta_fields: - book_fields[tag_name] = meta_fields[tag_name] - - self.index_content(book, book_fields=book_fields, legacy=legacy) - master_tags = [ 'opowiadanie', 'powiesc', @@ -307,7 +23,7 @@ class Index(SolrIndex): 'uwaga', 'extra', 'nota_red', 'abstrakt', 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu', 'didaskalia', - 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', + 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw' ] footnote_tags = ['pa', 'pt', 'pr', 'pe'] @@ -315,85 +31,41 @@ class Index(SolrIndex): skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'] - published_date_re = re.compile("([0-9]+)[\]. ]*$") - - def extract_metadata(self, book, book_info=None, dc_only=None): - """ - Extract metadata from book and returns a map of fields keyed by fieldname - """ - fields = {} - - if book_info is None: - book_info = dcparser.parse(open(book.xml_file.path, 'rb')) - - fields['slug'] = book.slug - fields['is_book'] = True - - # validator, name - for field in dcparser.BookInfo.FIELDS: - if dc_only and field.name not in dc_only: - continue - if hasattr(book_info, field.name): - if not getattr(book_info, field.name): - continue - type_indicator = field.value_type - if issubclass(type_indicator, librarian.meta.types.text.TextValue): - s = getattr(book_info, field.name) - if field.multiple: - s = ', '.join(s) - fields[field.name] = s - elif issubclass(type_indicator, librarian.meta.types.person.Person): - p = getattr(book_info, field.name) - if isinstance(p, librarian.meta.types.person.Person): - persons = str(p) - else: - persons = ', '.join(map(str, p)) - fields[field.name] = persons - - # get published date - pd = None - if hasattr(book_info, 'source_name') and book_info.source_name: - match = self.published_date_re.search(book_info.source_name) - if match is not None: - pd = str(match.groups()[0]) - if not pd: - pd = "" - fields["published_date"] = pd - - return fields - - # def add_gaps(self, fields, fieldname): - # """ - # Interposes a list of fields with gap-fields, which are indexed spaces and returns it. - # This allows for doing phrase queries which do not overlap the gaps (when slop is 0). - # """ - # def gap(): - # while True: - # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED) - # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1] - - def get_master(self, root): + @classmethod + def get_master(cls, root): """ Returns the first master tag from an etree. """ for master in root.iter(): - if master.tag in self.master_tags: + if master.tag in cls.master_tags: return master - def index_content(self, book, book_fields, legacy=True): + @staticmethod + def add_snippet(book, text, position): + book.snippet_set.create( + sec=position + 1, + text=text + ) + + @classmethod + def index_book(cls, book): """ Walks the book XML and extract content from it. Adds parts for each header tag and for each fragment. """ + if not book.xml_file: return + + book.snippet_set.all().delete() + wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False) root = wld.edoc.getroot() - master = self.get_master(root) + master = cls.get_master(root) if master is None: return [] def walker(node): - if node.tag not in self.ignore_content_tags: + if node.tag not in cls.ignore_content_tags: yield node, None, None if node.text is not None: yield None, node.text, None @@ -407,627 +79,43 @@ class Index(SolrIndex): return def fix_format(text): - # separator = [" ", "\t", ".", ";", ","] if isinstance(text, list): - # need to join it first text = filter(lambda s: s is not None, content) text = ' '.join(text) - # for i in range(len(text)): - # if i > 0: - # if text[i][0] not in separator\ - # and text[i - 1][-1] not in separator: - # text.insert(i, " ") return re.sub("(?m)/$", "", text) - def add_part(snippets, **fields): - doc = self.create_book_doc(book) - for n, v in book_fields.items(): - doc[n] = v - - doc['header_index'] = fields["header_index"] - doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1 - doc['header_type'] = fields['header_type'] - - doc['text'] = fields['text'] - - # snippets - snip_pos = snippets.add(fields["text"]) - - doc['snippets_position'] = snip_pos[0] - doc['snippets_length'] = snip_pos[1] - if snippets.revision: - doc["snippets_revision"] = snippets.revision - - if 'fragment_anchor' in fields: - doc["fragment_anchor"] = fields['fragment_anchor'] - - if 'themes' in fields: - doc['themes'] = fields['themes'] - doc['uid'] = "part%s-%s-%s-%s" % ( - book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', '')) - return doc - - fragments = {} - snippets = Snippets(book.id).open('w') - try: - for header, position in zip(list(master), range(len(master))): - - if header.tag in self.skip_header_tags: - continue - if header.tag is etree.Comment: - continue - - # section content - content = [] - footnote = [] - - def all_content(text): - for frag in fragments.values(): - frag['text'].append(text) - content.append(text) - handle_text = [all_content] - - for start, text, end in walker(header): - # handle footnotes - if start is not None and start.tag in self.footnote_tags: - footnote = [] - - def collect_footnote(t): - footnote.append(t) - - handle_text.append(collect_footnote) - elif end is not None and footnote is not [] and end.tag in self.footnote_tags: - handle_text.pop() - doc = add_part(snippets, header_index=position, header_type=header.tag, - text=''.join(footnote)) - self.add_snippet(book, doc) - if legacy: - self.index.add(doc) - footnote = [] - - # handle fragments and themes. - if start is not None and start.tag == 'begin': - fid = start.attrib['id'][1:] - fragments[fid] = { - 'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag} - - # themes for this fragment - elif start is not None and start.tag == 'motyw': - fid = start.attrib['id'][1:] - handle_text.append(lambda text: None) - if start.text is not None: - fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(',')))) - elif end is not None and end.tag == 'motyw': - handle_text.pop() - - elif start is not None and start.tag == 'end': - fid = start.attrib['id'][1:] - if fid not in fragments: - continue # a broken node, skip it - frag = fragments[fid] - if not frag['themes']: - continue # empty themes list. - del fragments[fid] - - doc = add_part(snippets, - header_type=frag['start_header'], - header_index=frag['start_section'], - header_span=position - frag['start_section'] + 1, - fragment_anchor=fid, - text=fix_format(frag['text']), - themes=frag['themes']) - # Add searchable fragment - self.add_snippet(book, doc) - if legacy: - self.index.add(doc) - - # Collect content. - - if text is not None and handle_text is not []: - hdl = handle_text[-1] - hdl(text) - - # in the end, add a section text. - doc = add_part(snippets, header_index=position, - header_type=header.tag, text=fix_format(content)) - - self.add_snippet(book, doc) - if legacy: - self.index.add(doc) - - finally: - snippets.close() - - def remove_picture(self, picture_or_id): - """Removes a picture from search index.""" - if isinstance(picture_or_id, picture.models.Picture): - picture_id = picture_or_id.id - else: - picture_id = picture_or_id - self.delete_query(self.index.Q(picture_id=picture_id)) - - def index_picture(self, picture, picture_info=None, overwrite=True): - """ - Indexes the picture. - Creates a lucene document for extracted metadata - and calls self.index_area() to index the contents of the picture. - """ - if overwrite: - # we don't remove snippets, since they might be still needed by - # threads using not reopened index - self.remove_picture(picture) - - picture_doc = {'picture_id': int(picture.id)} - meta_fields = self.extract_metadata(picture, picture_info, dc_only=[ - 'authors', 'title', 'epochs', 'kinds', 'genres']) - - picture_doc.update(meta_fields) - - picture_doc['uid'] = "picture%s" % picture_doc['picture_id'] - self.index.add(picture_doc) - del picture_doc['is_book'] - for area in picture.areas.all(): - self.index_area(area, picture_fields=picture_doc) - - def index_area(self, area, picture_fields): - """ - Indexes themes and objects on the area. - """ - doc = dict(picture_fields) - doc['area_id'] = area.id - doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True)) - doc['uid'] = 'area%s' % area.id - self.index.add(doc) - - -@total_ordering -class SearchResult(object): - def __init__(self, doc, how_found=None, query_terms=None): - self.boost = 1.0 - self._hits = [] - self._processed_hits = None # processed hits - self.snippets = [] - self.query_terms = query_terms - self._book = None - - if 'score' in doc: - self._score = doc['score'] - else: - self._score = 0 - - self.book_id = int(doc["book_id"]) - - try: - self.published_date = int(doc.get("published_date")) - except ValueError: - self.published_date = 0 - - # content hits - header_type = doc.get("header_type", None) - # we have a content hit in some header of fragment - if header_type is not None: - sec = (header_type, int(doc["header_index"])) - header_span = doc['header_span'] - header_span = header_span is not None and int(header_span) or 1 - fragment = doc.get("fragment_anchor", None) - snippets_pos = (doc['snippets_position'], doc['snippets_length']) - snippets_rev = doc.get('snippets_revision', None) - - hit = (sec + (header_span,), fragment, self._score, { - 'how_found': how_found, - 'snippets_pos': snippets_pos, - 'snippets_revision': snippets_rev, - 'themes': doc.get('themes', []), - 'themes_pl': doc.get('themes_pl', []) - }) - - self._hits.append(hit) - - @classmethod - def from_book(cls, book, how_found=None, query_terms=None): - doc = { - 'score': book.popularity.count, - 'book_id': book.id, - 'published_date': 0, - } - result = cls(doc, how_found=how_found, query_terms=query_terms) - result._book = book - return result - - def __str__(self): - return "" % \ - (self.book_id, len(self._hits), - len(self._processed_hits) if self._processed_hits else -1, - self._score, len(self.snippets)) - - def __bytes__(self): - return str(self).encode('utf-8') - - @property - def score(self): - return self._score * self.boost - - def merge(self, other): - if self.book_id != other.book_id: - raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id)) - self._hits += other._hits - self._score += max(other._score, 0) - return self - - def get_book(self): - if self._book is not None: - return self._book - try: - self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True) - except catalogue.models.Book.DoesNotExist: - self._book = None - return self._book - - book = property(get_book) - - POSITION = 0 - FRAGMENT = 1 - POSITION_INDEX = 1 - POSITION_SPAN = 2 - SCORE = 2 - OTHER = 3 - - @property - def hits(self): - if self._processed_hits is not None: - return self._processed_hits - - # to sections and fragments - frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits) - - sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None] - - # sections not covered by fragments - sect = filter(lambda s: 0 == len(list(filter( - lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] < - f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect) - - def remove_duplicates(lst, keyfn, larger): - els = {} - for e in lst: - eif = keyfn(e) - if eif in els: - if larger(els[eif], e): - continue - els[eif] = e - return els.values() - - # remove fragments with duplicated fid's and duplicated snippets - frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE]) - - # remove duplicate sections - sections = {} - - for s in sect: - si = s[self.POSITION][self.POSITION_INDEX] - # skip existing - if si in sections: - if sections[si]['score'] >= s[self.SCORE]: - continue - - m = {'score': s[self.SCORE], - 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1, - } - m.update(s[self.OTHER]) - sections[si] = m - - hits = list(sections.values()) - - for f in frags: - try: - frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id) - except catalogue.models.Fragment.DoesNotExist: - # stale index + for position, header in enumerate(master): + if header.tag in cls.skip_header_tags: continue - # Figure out if we were searching for a token matching some word in theme name. - themes = frag.tags.filter(category='theme') - themes_hit = set() - if self.query_terms is not None: - for i in range(0, len(f[self.OTHER]['themes'])): - tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ') - tms = map(str.lower, tms) - for qt in self.query_terms: - if qt in tms: - themes_hit.add(f[self.OTHER]['themes'][i]) - break - - def theme_by_name(n): - th = list(filter(lambda t: t.name == n, themes)) - if th: - return th[0] - else: - return None - themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit))) - - m = {'score': f[self.SCORE], - 'fragment': frag, - 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1, - 'themes': themes, - 'themes_hit': themes_hit - } - m.update(f[self.OTHER]) - hits.append(m) - - hits.sort(key=lambda h: h['score'], reverse=True) - - self._processed_hits = hits - - return hits - - @staticmethod - def aggregate(*result_lists): - books = {} - for rl in result_lists: - for r in rl: - if r.book_id in books: - books[r.book_id].merge(r) - else: - books[r.book_id] = r - return books.values() - - def get_sort_key(self): - return (-self.score, - self.published_date, - self.book.sort_key_author if self.book else '', - self.book.sort_key if self.book else '') - - def __lt__(self, other): - return self.get_sort_key() > other.get_sort_key() - - def __eq__(self, other): - return self.get_sort_key() == other.get_sort_key() - - def __len__(self): - return len(self.hits) - - def snippet_pos(self, idx=0): - return self.hits[idx]['snippets_pos'] - - def snippet_revision(self, idx=0): - try: - return self.hits[idx]['snippets_revision'] - except (IndexError, KeyError): - return None - - -@total_ordering -class PictureResult(object): - def __init__(self, doc, how_found=None, query_terms=None): - self.boost = 1.0 - self.query_terms = query_terms - self._picture = None - self._hits = [] - self._processed_hits = None - - if 'score' in doc: - self._score = doc['score'] - else: - self._score = 0 - - self.picture_id = int(doc["picture_id"]) - - if doc.get('area_id'): - hit = (self._score, { - 'how_found': how_found, - 'area_id': doc['area_id'], - 'themes': doc.get('themes', []), - 'themes_pl': doc.get('themes_pl', []), - }) - - self._hits.append(hit) - - def __str__(self): - return "" % (self.picture_id, self._score) - - def __repr__(self): - return str(self) - - @property - def score(self): - return self._score * self.boost - - def merge(self, other): - if self.picture_id != other.picture_id: - raise ValueError( - "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id)) - self._hits += other._hits - self._score += max(other._score, 0) - return self - - SCORE = 0 - OTHER = 1 - - @property - def hits(self): - if self._processed_hits is not None: - return self._processed_hits - - hits = [] - for hit in self._hits: - try: - area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id']) - except picture.models.PictureArea.DoesNotExist: - # stale index + if header.tag is etree.Comment: continue - # Figure out if we were searching for a token matching some word in theme name. - themes_hit = set() - if self.query_terms is not None: - for i in range(0, len(hit[self.OTHER]['themes'])): - tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ') - tms = map(str.lower, tms) - for qt in self.query_terms: - if qt in tms: - themes_hit.add(hit[self.OTHER]['themes'][i]) - break - m = { - 'score': hit[self.SCORE], - 'area': area, - 'themes_hit': themes_hit, - } - m.update(hit[self.OTHER]) - hits.append(m) + # section content + content = [] + footnote = [] - hits.sort(key=lambda h: h['score'], reverse=True) - hits = hits[:1] - self._processed_hits = hits - return hits + def all_content(text): + content.append(text) + handle_text = [all_content] - def get_picture(self): - if self._picture is None: - self._picture = picture.models.Picture.objects.get(id=self.picture_id) - return self._picture - - picture = property(get_picture) - - @staticmethod - def aggregate(*result_lists): - books = {} - for rl in result_lists: - for r in rl: - if r.picture_id in books: - books[r.picture_id].merge(r) - else: - books[r.picture_id] = r - return books.values() + for start, text, end in walker(header): + # handle footnotes + if start is not None and start.tag in cls.footnote_tags: + footnote = [] - def __lt__(self, other): - return self.score < other.score + def collect_footnote(t): + footnote.append(t) - def __eq__(self, other): - return self.score == other.score - - -class Search(SolrIndex): - """ - Search facilities. - """ - def __init__(self, default_field="text"): - super(Search, self).__init__(mode='r') - - def make_term_query(self, query, field='text', modal=operator.or_): - """ - Returns term queries joined by boolean query. - modal - applies to boolean query - fuzzy - should the query by fuzzy. - """ - if query is None: - query = '' - q = self.index.Q() - q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q) - - return q - - def search_by_author(self, words): - from catalogue.models import Book - books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count') - for word in words: - books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count') - return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]] - - def search_words(self, words, fields, required=None, book=True, picture=False): - if book and not picture and fields == ['authors']: - return self.search_by_author(words) - filters = [] - for word in words: - if book or picture or (word not in stopwords): - word_filter = None - for field in fields: - q = self.index.Q(**{field: word}) - if word_filter is None: - word_filter = q - else: - word_filter |= q - filters.append(word_filter) - if required: - required_filter = None - for field in required: - for word in words: - if book or picture or (word not in stopwords): - q = self.index.Q(**{field: word}) - if required_filter is None: - required_filter = q - else: - required_filter |= q - filters.append(required_filter) - if not filters: - return [] - params = {} - if book: - params['is_book'] = True - if picture: - params['picture_id__gt'] = 0 - else: - params['book_id__gt'] = 0 - query = self.index.query(**params) - query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True) - result_class = PictureResult if picture else SearchResult - return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()] - - def get_snippets(self, searchresult, query, field='text', num=1): - """ - Returns a snippet for found scoreDoc. - """ - maxnum = len(searchresult) - if num is None or num < 0 or num > maxnum: - num = maxnum - book_id = searchresult.book_id - revision = searchresult.snippet_revision() - snippets = Snippets(book_id, revision=revision) - snips = [None] * maxnum - try: - snippets.open() - idx = 0 - while idx < maxnum and num > 0: - position, length = searchresult.snippet_pos(idx) - if position is None or length is None: - continue - text = snippets.get((int(position), - int(length))) - snip = self.index.highlight(text=text, field=field, q=query) - if not snip and field == 'text': - snip = self.index.highlight(text=text, field='text_nonstem', q=query) - if snip not in snips: - snips[idx] = snip - if snip: - num -= 1 - idx += 1 - - except IOError as e: - book = catalogue.models.Book.objects.filter(id=book_id, findable=True) - if not book: - log.error("Book does not exist for book id = %d" % book_id) - elif not book.get().children.exists(): - log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e)) - return [] - finally: - snippets.close() - - # remove verse end markers.. - snips = [s.replace("/\n", "\n") if s else s for s in snips] - - searchresult.snippets = snips - - return snips - - @staticmethod - def apply_filters(query, filters): - """ - Apply filters to a query - """ - if filters is None: - filters = [] - filters = filter(lambda x: x is not None, filters) - for f in filters: - query = query.query(f) - return query + handle_text.append(collect_footnote) + elif end is not None and footnote is not [] and end.tag in cls.footnote_tags: + handle_text.pop() + cls.add_snippet(book, ''.join(footnote), position) + footnote = [] + if text is not None and handle_text is not []: + hdl = handle_text[-1] + hdl(text) -if getattr(settings, 'SEARCH_MOCK', False): - from .mock_search import Search + # in the end, add a section text. + cls.add_snippet(book, fix_format(content), position) diff --git a/src/search/management/__init__.py b/src/search/management/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/search/management/commands/__init__.py b/src/search/management/commands/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/search/management/commands/reindex.py b/src/search/management/commands/reindex.py deleted file mode 100644 index c2fe78e94..000000000 --- a/src/search/management/commands/reindex.py +++ /dev/null @@ -1,105 +0,0 @@ -# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -import sys -import traceback - -from django.core.management.base import BaseCommand - - -def query_yes_no(question, default="yes"): - """Ask a yes/no question via raw_input() and return their answer. - - "question" is a string that is presented to the user. - "default" is the presumed answer if the user just hits . - It must be "yes" (the default), "no" or None (meaning - an answer is required of the user). - - The "answer" return value is one of "yes" or "no". - """ - valid = {"yes": True, "y": True, "ye": True, - "no": False, "n": False} - if default is None: - prompt = " [y/n] " - elif default == "yes": - prompt = " [Y/n] " - elif default == "no": - prompt = " [y/N] " - else: - raise ValueError("invalid default answer: '%s'" % default) - - while True: - sys.stdout.write(question + prompt) - choice = raw_input().lower() - if default is not None and choice == '': - return valid[default] - elif choice in valid: - return valid[choice] - else: - sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") - - -class Command(BaseCommand): - help = 'Reindex everything.' - - def add_arguments(self, parser): - parser.add_argument( - '-n', '--book-id', action='store_true', dest='book_id', - default=False, help='book id instead of slugs') - parser.add_argument( - '-t', '--just-tags', action='store_true', dest='just_tags', - default=False, help='just reindex tags') - parser.add_argument( - '--start', dest='start_from', default=None, - help='start from this slug') - parser.add_argument( - '--stop', dest='stop_after', default=None, - help='stop after this slug') - parser.add_argument('args', nargs='*', metavar='slug/id') - - def handle(self, **opts): - from catalogue.models import Book - from search.index import Index - idx = Index() - - if not opts['just_tags']: - if opts['args']: - books = [] - for a in opts['args']: - if opts['book_id']: - books += Book.objects.filter(id=int(a)).all() - else: - books += Book.objects.filter(slug=a).all() - else: - books = list(Book.objects.order_by('slug')) - start_from = opts.get('start_from') - stop_after = opts.get('stop_after') - if start_from: - start_from = start_from.replace('-', '') - if stop_after: - stop_after = stop_after.replace('-', '') - while books: - try: - b = books[0] - slug = b.slug.replace('-', '') - if stop_after and slug > stop_after: - break - if not start_from or slug >= start_from: - print(b.slug) - idx.index_book(b) - idx.index.commit() - books.pop(0) - except: - traceback.print_exc() - try: - # we might not be able to rollback - idx.index.rollback() - except: - pass - retry = query_yes_no("Retry?") - if not retry: - break - - print('Reindexing tags.') - idx.index_tags() - idx.index.commit() diff --git a/src/search/management/commands/reindex_pictures.py b/src/search/management/commands/reindex_pictures.py deleted file mode 100644 index 8505189a1..000000000 --- a/src/search/management/commands/reindex_pictures.py +++ /dev/null @@ -1,81 +0,0 @@ -# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -import sys -import traceback - -from django.core.management.base import BaseCommand - - -def query_yes_no(question, default="yes"): - """Ask a yes/no question via raw_input() and return their answer. - - "question" is a string that is presented to the user. - "default" is the presumed answer if the user just hits . - It must be "yes" (the default), "no" or None (meaning - an answer is required of the user). - - The "answer" return value is one of "yes" or "no". - """ - valid = {"yes": True, "y": True, "ye": True, - "no": False, "n": False} - if default is None: - prompt = " [y/n] " - elif default == "yes": - prompt = " [Y/n] " - elif default == "no": - prompt = " [y/N] " - else: - raise ValueError("invalid default answer: '%s'" % default) - - while True: - sys.stdout.write(question + prompt) - choice = raw_input().lower() - if default is not None and choice == '': - return valid[default] - elif choice in valid: - return valid[choice] - else: - sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") - - -class Command(BaseCommand): - help = 'Reindex pictures.' - - def add_arguments(self, parser): - self.add_argument( - '-n', '--picture-id', action='store_true', dest='picture_id', - default=False, help='picture id instead of slugs') - self.add_argument('slug/id', nargs='*', metavar='slug/id') - - def handle(self, **opts): - from picture.models import Picture - from search.index import Index - idx = Index() - - if opts['args']: - pictures = [] - for a in opts['args']: - if opts['picture_id']: - pictures += Picture.objects.filter(id=int(a)).all() - else: - pictures += Picture.objects.filter(slug=a).all() - else: - pictures = list(Picture.objects.order_by('slug')) - while pictures: - try: - p = pictures[0] - print(p.slug) - idx.index_picture(p) - idx.index.commit() - pictures.pop(0) - except: - traceback.print_exc() - try: - # we might not be able to rollback - idx.index.rollback() - except: - pass - retry = query_yes_no("Retry?") - if not retry: - break diff --git a/src/search/management/commands/snippets.py b/src/search/management/commands/snippets.py deleted file mode 100644 index 62512c94b..000000000 --- a/src/search/management/commands/snippets.py +++ /dev/null @@ -1,23 +0,0 @@ -# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -from glob import glob -from os import path -from django.conf import settings -from django.core.management.base import BaseCommand - - -class Command(BaseCommand): - help = 'Check snippets.' - - def handle(self, *args, **opts): - sfn = glob(settings.SEARCH_INDEX+'snippets/*') - for fn in sfn: - print(fn) - bkid = path.basename(fn) - with open(fn) as f: - cont = f.read() - try: - cont.decode('utf-8') - except UnicodeDecodeError: - print("error in snippets %s" % bkid) diff --git a/src/search/mock_search.py b/src/search/mock_search.py deleted file mode 100644 index 33d2a5eaf..000000000 --- a/src/search/mock_search.py +++ /dev/null @@ -1,40 +0,0 @@ -# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. -# -from unittest.mock import Mock -from catalogue.models import Book, Tag -from random import randint, choice - - -class Search(Mock): - """ - Search mock for development without setting up Solr. - - Instead of connecting to an actual search server, it returns - some random results for any query. - """ - class MockIndex(Mock): - def analyze(*args, **kwargs): - return [] - - index = MockIndex() - - def search_words(self, words, fields, required=None, book=True, picture=False): - from .index import SearchResult - - max_results = 20 - - if picture: return [] - - qs = Book.objects.filter(findable=True).order_by('?') - results = [] - for book in qs[:randint(1, max_results)]: - doc = { - 'score': randint(0, 100), - 'book_id': book.pk, - 'published_date': randint(1000, 1920), - } - res = SearchResult(doc, how_found='mock', query_terms=words) - results.append(res) - return results - diff --git a/src/search/tests/index.py b/src/search/tests/index.py index 34d958603..d63bafbf7 100644 --- a/src/search/tests/index.py +++ b/src/search/tests/index.py @@ -1,40 +1,27 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from unittest import skipIf from django.conf import settings from django.test.utils import override_settings from catalogue.test_utils import WLTestCase, get_fixture -import tempfile from catalogue.models import Book -from search.index import Index, Search import catalogue import opds -@override_settings(SEARCH_INDEX=tempfile.mkdtemp(prefix='djangotest_search_')) -@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False), - 'Requires search server and NO_SEARCH_INDEX=False.') class BookSearchTests(WLTestCase): def setUp(self): WLTestCase.setUp(self) - index = Index() - self.search = Search() - index.delete_query(self.search.index.query(uid="*")) - index.index.commit() + with override_settings(NO_SEARCH_INDEX=False): + self.do_doktora = Book.from_xml_file( + get_fixture('do-doktora.xml', opds)) + self.do_anusie = Book.from_xml_file( + get_fixture('fraszka-do-anusie.xml', catalogue)) - self.do_doktora = Book.from_xml_file( - get_fixture('do-doktora.xml', opds)) - self.do_anusie = Book.from_xml_file( - get_fixture('fraszka-do-anusie.xml', catalogue)) - - # TODO: Add slop option to sunburnt - # def test_search_perfect_parts(self): - # books = self.search.search_phrase("Jakoż hamować") - # assert len(books) == 2 - # for b in books: - # b.book_id == self.book.id - # a = SearchResult.aggregate(books) - # # just one fragment hit. - # assert len(a[0].hits) == 1 + def test_search_perfect_parts(self): + response = self.client.get('/szukaj/?q=Jakoż hamować') + res = response.context['results'] + self.assertEqual(len(res['snippet']), 1) + for b, s in res['snippet'].items(): + self.assertEqual(b.id, self.do_anusie.id) diff --git a/src/search/urls.py b/src/search/urls.py index 46e73c5bc..1e66d3314 100644 --- a/src/search/urls.py +++ b/src/search/urls.py @@ -6,6 +6,6 @@ from . import views urlpatterns = [ - path('', views.main, name='wlsearch'), + path('', views.search, name='wlsearch'), path('hint/', views.hint, name='search_hint'), ] diff --git a/src/search/utils.py b/src/search/utils.py index 6c0acf594..77ff1ae11 100644 --- a/src/search/utils.py +++ b/src/search/utils.py @@ -1,3 +1,4 @@ +from django.conf import settings from django.db.models import Func from django.contrib.postgres.search import SearchQuery, SearchVectorField @@ -8,7 +9,8 @@ class UnaccentSearchQuery(SearchQuery): ''' def as_sql(self, *args, **kwargs): sql, params = super().as_sql(*args, **kwargs) - sql = f'unaccent({sql}::text)::tsquery' + if settings.SEARCH_USE_UNACCENT: + sql = f'unaccent({sql}::text)::tsquery' return sql, params @@ -19,10 +21,11 @@ class UnaccentSearchVector(Func): But user enters 'roze' -> stem leaves it as is, so we need original form in the vector. ''' function='to_tsvector' - template = '''unaccent( - %(function)s('polish', %(expressions)s)::text)::tsvector || - to_tsvector( - 'polish_simple', - unaccent(%(expressions)s) - )''' + if settings.SEARCH_USE_UNACCENT: + template = f'''unaccent( + %(function)s('{settings.SEARCH_CONFIG}', %(expressions)s)::text)::tsvector || + to_tsvector( + '{settings.SEARCH_CONFIG_SIMPLE}', + unaccent(%(expressions)s) + )''' output_field = SearchVectorField() diff --git a/src/search/views.py b/src/search/views.py index b5cc0baa3..e5ea59837 100644 --- a/src/search/views.py +++ b/src/search/views.py @@ -2,30 +2,18 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # from django.conf import settings -from django.http.response import HttpResponseRedirect from django.shortcuts import render from django.views.decorators import cache from django.http import HttpResponse, JsonResponse from catalogue.models import Book, Tag -from pdcounter.models import Author -from picture.models import Picture -from search.index import Search, SearchResult, PictureResult from .forms import SearchFilters -from suggest.forms import PublishingSuggestForm import re import json from wolnelektury.utils import re_escape -def match_word_re(word): - if 'sqlite' in settings.DATABASES['default']['ENGINE']: - return r"\b%s\b" % word - elif 'mysql' in settings.DATABASES['default']['ENGINE']: - return "[[:<:]]%s[[:>:]]" % word - - query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]") @@ -33,32 +21,6 @@ def remove_query_syntax_chars(query, replace=' '): return query_syntax_chars.sub(replace, query) -def did_you_mean(query, tokens): - return query - # change = {} - # for t in tokens: - # authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t)) - # if len(authors) > 0: - # continue - - # if False: - # if not dictionary.check(t): - # try: - # change_to = dictionary.suggest(t)[0].lower() - # if change_to != t.lower(): - # change[t] = change_to - # except IndexError: - # pass - - # if change == {}: - # return None - - # for frm, to in change.items(): - # query = query.replace(frm, to) - - # return query - - @cache.never_cache def hint(request, mozhint=False, param='term'): prefix = request.GET.get(param, '') @@ -133,212 +95,3 @@ def search(request): ctx['hasresults'] = True break return render(request, 'search/results.html', ctx) - - -@cache.never_cache -def main(request): - if request.EXPERIMENTS['layout'].value: - return search(request) - - query = request.GET.get('q', '') - - format = request.GET.get('format') - lang = request.GET.get('lang') - epoch = request.GET.get('epoch') - kind = request.GET.get('kind') - genre = request.GET.get('genre') - - if len(query) < 2: - return render( - request, 'catalogue/search_too_short.html', - {'prefix': query}) - elif len(query) > 256: - return render( - request, 'catalogue/search_too_long.html', - {'prefix': query}) - - query = prepare_query(query) - if not (format or lang or epoch or kind or genre): - pd_authors = search_pd_authors(query) - else: - pd_authors = [] - if not format or format != 'obraz': - books = search_books( - query, - lang=lang, - only_audio=format=='audio', - only_synchro=format=='synchro', - epoch=epoch, - kind=kind, - genre=genre - ) - else: - books = [] - if (not format or format == 'obraz') and not lang: - pictures = search_pictures( - query, - epoch=epoch, - kind=kind, - genre=genre - ) - else: - pictures = [] - - suggestion = '' - - if not (books or pictures or pd_authors): - form = PublishingSuggestForm(initial={"books": query + ", "}) - return render( - request, - 'catalogue/search_no_hits.html', - { - 'form': form, - 'did_you_mean': suggestion - }) - - if not (books or pictures) and len(pd_authors) == 1: - return HttpResponseRedirect(pd_authors[0].get_absolute_url()) - - return render( - request, - 'catalogue/search_multiple_hits.html', - { - 'pd_authors': pd_authors, - 'books': books, - 'pictures': pictures, - 'did_you_mean': suggestion, - 'set': { - 'lang': lang, - 'format': format, - 'epoch': epoch, - 'kind': kind, - 'genre': genre, - }, - 'tags': { - 'epoch': Tag.objects.filter(category='epoch', for_books=True), - 'genre': Tag.objects.filter(category='genre', for_books=True), - 'kind': Tag.objects.filter(category='kind', for_books=True), - }, - }) - -def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None): - search = Search() - results_parts = [] - search_fields = [] - words = query.split() - fieldsets = ( - (['authors', 'authors_nonstem'], True), - (['title', 'title_nonstem'], True), - (['metadata', 'metadata_nonstem'], True), - (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False), - ) - for fields, is_book in fieldsets: - search_fields += fields - results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book)) - results = [] - ids_results = {} - for results_part in results_parts: - for result in sorted(SearchResult.aggregate(results_part), reverse=True): - book_id = result.book_id - if book_id in ids_results: - ids_results[book_id].merge(result) - else: - results.append(result) - ids_results[book_id] = result - descendant_ids = set( - Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True)) - results = [result for result in results if result.book_id not in descendant_ids] - for result in results: - search.get_snippets(result, query, num=3) - - def ensure_exists(r): - try: - if not r.book: - return False - except Book.DoesNotExist: - return False - - if lang and r.book.language != lang: - return False - if only_audio and not r.book.has_mp3_file(): - return False - if only_synchro and not r.book.has_daisy_file(): - return False - if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists(): - return False - if kind and not r.book.tags.filter(category='kind', slug=kind).exists(): - return False - if genre and not r.book.tags.filter(category='genre', slug=genre).exists(): - return False - - return True - - results = [r for r in results if ensure_exists(r)] - return results - - -def search_pictures(query, epoch=None, kind=None, genre=None): - search = Search() - results_parts = [] - search_fields = [] - words = query.split() - fieldsets = ( - (['authors', 'authors_nonstem'], True), - (['title', 'title_nonstem'], True), - (['metadata', 'metadata_nonstem'], True), - (['themes_pl', 'themes_pl_nonstem'], False), - ) - for fields, is_book in fieldsets: - search_fields += fields - results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True)) - results = [] - ids_results = {} - for results_part in results_parts: - for result in sorted(PictureResult.aggregate(results_part), reverse=True): - picture_id = result.picture_id - if picture_id in ids_results: - ids_results[picture_id].merge(result) - else: - results.append(result) - ids_results[picture_id] = result - - def ensure_exists(r): - try: - if not r.picture: - return False - except Picture.DoesNotExist: - return False - - if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists(): - return False - if kind and not r.picture.tags.filter(category='kind', slug=kind).exists(): - return False - if genre and not r.picture.tags.filter(category='genre', slug=genre).exists(): - return False - - return True - - results = [r for r in results if ensure_exists(r)] - return results - - -def search_pd_authors(query): - pd_authors = Author.objects.filter(name__icontains=query) - existing_slugs = Tag.objects.filter( - category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \ - .values_list('slug', flat=True) - pd_authors = pd_authors.exclude(slug__in=existing_slugs) - return pd_authors - - -def prepare_query(query): - query = ' '.join(query.split()) - # filter out private use characters - import unicodedata - query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co') - query = remove_query_syntax_chars(query) - - words = query.split() - if len(words) > 10: - query = ' '.join(words[:10]) - return query diff --git a/src/wolnelektury/settings/__init__.py b/src/wolnelektury/settings/__init__.py index 113c1e731..f772d3d56 100644 --- a/src/wolnelektury/settings/__init__.py +++ b/src/wolnelektury/settings/__init__.py @@ -29,15 +29,6 @@ except NameError: CELERY_TASK_ALWAYS_EAGER = True -# If SEARCH_INDEX not configured, disable the search. -try: - SOLR -except NameError: - NO_SEARCH_INDEX = True -else: - NO_SEARCH_INDEX = False - - try: SENTRY_DSN except NameError: diff --git a/src/wolnelektury/settings/basic.py b/src/wolnelektury/settings/basic.py index bbf684f35..413adbe7c 100644 --- a/src/wolnelektury/settings/basic.py +++ b/src/wolnelektury/settings/basic.py @@ -28,9 +28,6 @@ DATABASES = { DEFAULT_AUTO_FIELD = 'django.db.models.AutoField' -SOLR_TEST = "http://localhost:8983/solr/wl_test/" -SOLR_STOPWORDS = "/path/to/solr/data/conf/lang/stopwords_pl.txt" - # Local time zone for this installation. Choices can be found here: # http://en.wikipedia.org/wiki/List_of_tz_zones_by_name # although not all choices may be available on all operating systems. diff --git a/src/wolnelektury/settings/custom.py b/src/wolnelektury/settings/custom.py index f7fca4755..51a5613fd 100644 --- a/src/wolnelektury/settings/custom.py +++ b/src/wolnelektury/settings/custom.py @@ -68,6 +68,9 @@ CIVICRM_ACTIVITIES = { EXPERIMENTS_LAYOUT = 1 EXPERIMENTS_SOWKA = 0 -EXPERIMENTS_SEARCH = 0 WIDGETS = {} + +SEARCH_CONFIG = 'english' +SEARCH_CONFIG_SIMPLE = 'simple' +SEARCH_USE_UNACCENT = False diff --git a/src/wolnelektury/settings/static.py b/src/wolnelektury/settings/static.py index 179467d9c..97dcec468 100644 --- a/src/wolnelektury/settings/static.py +++ b/src/wolnelektury/settings/static.py @@ -8,7 +8,6 @@ from .paths import VAR_DIR # Example: "/home/media/media.lawrence.com/" MEDIA_ROOT = path.join(VAR_DIR, 'media/') STATIC_ROOT = path.join(VAR_DIR, 'static/') -SEARCH_INDEX = path.join(VAR_DIR, 'search_index/') # URL that handles the media served from MEDIA_ROOT. Make sure to use a # trailing slash if there is a path component (optional in other cases). diff --git a/src/wolnelektury/settings/test.py b/src/wolnelektury/settings/test.py index 57718b854..0e10be9fb 100644 --- a/src/wolnelektury/settings/test.py +++ b/src/wolnelektury/settings/test.py @@ -6,3 +6,7 @@ from wolnelektury.settings import * THUMBNAIL_BACKEND = 'wolnelektury.test_utils.DummyThumbnailBackend' CATALOGUE_GET_MP3_LENGTH = 'catalogue.test_utils.get_mp3_length' MEDIA_URL = '/media/' + +SEARCH_CONFIG = 'english' +SEARCH_CONFIG_SIMPLE = 'simple' +SEARCH_USE_UNACCENT = False -- 2.20.1