From 801a05d2ff33bb8a3c1a46ea0c657825b2787fa7 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Mon, 5 Jun 2023 13:01:27 +0200 Subject: [PATCH] Search --- .../0045_snippet_search_vector_idx.py | 19 +++ src/catalogue/models/snippet.py | 12 +- src/experiments/experiments.py | 9 +- src/pdcounter/models.py | 26 +++ src/search/forms.py | 152 +++++++++++------- src/search/templates/search/results.html | 52 ++++-- src/search/utils.py | 83 ++-------- src/search/views.py | 23 +++ src/wolnelektury/settings/custom.py | 3 +- .../2022/styles/components/_search.scss | 1 + 10 files changed, 239 insertions(+), 141 deletions(-) create mode 100644 src/catalogue/migrations/0045_snippet_search_vector_idx.py diff --git a/src/catalogue/migrations/0045_snippet_search_vector_idx.py b/src/catalogue/migrations/0045_snippet_search_vector_idx.py new file mode 100644 index 000000000..a89fdb7f3 --- /dev/null +++ b/src/catalogue/migrations/0045_snippet_search_vector_idx.py @@ -0,0 +1,19 @@ +# Generated by Django 4.0.8 on 2023-06-02 13:08 + +import django.contrib.postgres.indexes +from django.db import migrations +import django.db.models.expressions + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalogue', '0044_snippet'), + ] + + operations = [ + migrations.AddIndex( + model_name='snippet', + index=django.contrib.postgres.indexes.GinIndex(django.db.models.expressions.F('search_vector'), name='search_vector_idx'), + ), + ] diff --git a/src/catalogue/models/snippet.py b/src/catalogue/models/snippet.py index 3c9384cc7..4c25b8c97 100644 --- a/src/catalogue/models/snippet.py +++ b/src/catalogue/models/snippet.py @@ -1,23 +1,27 @@ from django.db import models +from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.search import SearchVector, SearchVectorField -from search.utils import build_search_vector +from search.utils import UnaccentSearchVector class Snippet(models.Model): book = models.ForeignKey('Book', models.CASCADE) sec = 
models.IntegerField() - # header_type ? - # header_span ? text = models.TextField() search_vector = SearchVectorField() + class Meta: + indexes = [ + GinIndex('search_vector', name='search_vector_idx'), + ] + def save(self, *args, **kwargs): super().save(*args, **kwargs) if not self.search_vector: self.update() def update(self): - self.search_vector = build_search_vector('text') + self.search_vector = UnaccentSearchVector('text') self.save() @classmethod diff --git a/src/experiments/experiments.py b/src/experiments/experiments.py index 15c103ff7..60d05213d 100644 --- a/src/experiments/experiments.py +++ b/src/experiments/experiments.py @@ -7,7 +7,7 @@ from .base import Experiment class NewLayout(Experiment): slug = 'layout' name = 'Nowy layout strony' - size = 1 or settings.EXPERIMENTS_LAYOUT + size = settings.EXPERIMENTS_LAYOUT def qualify(self, request): if get_language() != 'pl': @@ -21,7 +21,14 @@ class Sowka(Experiment): switchable = False +class Search(Experiment): + slug = 'search' + name = 'Nowa wyszukiwarka' + size = settings.EXPERIMENTS_SEARCH + + experiments = [ NewLayout, Sowka, + Search ] diff --git a/src/pdcounter/models.py b/src/pdcounter/models.py index 3ce7f72ce..2e1e0b90e 100644 --- a/src/pdcounter/models.py +++ b/src/pdcounter/models.py @@ -1,12 +1,14 @@ # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# +from django.apps import apps from django.conf import settings from django.db import models from django.urls import reverse from django.utils.translation import gettext_lazy as _ from datetime import datetime from django.db.models.signals import post_save, post_delete +from search.utils import UnaccentSearchVector class Author(models.Model): @@ -41,6 +43,18 @@ class Author(models.Model): has_description.short_description = _('description') has_description.boolean = True + @classmethod + def search(cls, query, qs=None): + Tag = apps.get_model('catalogue', 'Tag') + if qs is None: + qs = cls.objects.all() + pd_authors = qs.annotate(search_vector=UnaccentSearchVector('name')).filter(search_vector=query) + existing_slugs = Tag.objects.filter( + category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \ + .values_list('slug', flat=True) + pd_authors = pd_authors.exclude(slug__in=existing_slugs) + return pd_authors + def alive(self): return self.death is None @@ -72,6 +86,18 @@ class BookStub(models.Model): def __str__(self): return self.title + @classmethod + def search(cls, query, qs=None): + Book = apps.get_model('catalogue', 'Book') + if qs is None: + qs = cls.objects.all() + pd_books = qs.annotate(search_vector=UnaccentSearchVector('title')).filter(search_vector=query) + existing_slugs = Book.objects.filter( + slug__in=list(pd_books.values_list('slug', flat=True))) \ + .values_list('slug', flat=True) + pd_books = pd_books.exclude(slug__in=existing_slugs) + return pd_books + def get_absolute_url(self): return reverse('book_detail', args=[self.slug]) diff --git a/src/search/forms.py b/src/search/forms.py index 7efd7479d..f3bf0c024 100644 --- a/src/search/forms.py +++ b/src/search/forms.py @@ -2,12 +2,16 @@ # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. 
# from django.apps import apps +from django.conf import settings from django.contrib.postgres.search import SearchHeadline, SearchRank, SearchQuery from django import forms from django.utils.translation import gettext_lazy as _ - +from catalogue.constants import LANGUAGES_3TO2 +import catalogue.models +import pdcounter.models +import picture.models from .fields import JQueryAutoCompleteSearchField, InlineRadioWidget -from .utils import build_search_query +from .utils import UnaccentSearchQuery, UnaccentSearchVector class SearchForm(forms.Form): @@ -25,7 +29,10 @@ class SearchForm(forms.Form): class SearchFilters(forms.Form): - q = forms.CharField(required=False, widget=forms.HiddenInput()) + q = forms.CharField( + required=False, widget=forms.HiddenInput(), + min_length=2, max_length=256, + ) format = forms.ChoiceField(required=False, choices=[ ('', 'wszystkie'), ('text', 'tekst'), @@ -51,89 +58,98 @@ class SearchFilters(forms.Form): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - from catalogue.models import Book, Tag + langs = dict(settings.LANGUAGES) self.fields['lang'].choices = [('', 'wszystkie')] + [ - (b, b) - for b in Book.objects.values_list( + ( + b, + langs.get(LANGUAGES_3TO2.get(b, b), b) + ) + for b in catalogue.models.Book.objects.values_list( 'language', flat=True ).distinct().order_by() ] self.fields['epoch'].choices = [('', 'wszystkie')] + [ (b.slug, b.name) - for b in Tag.objects.filter(category='epoch') + for b in catalogue.models.Tag.objects.filter(category='epoch') ] self.fields['genre'].choices = [('', 'wszystkie')] + [ (b.slug, b.name) - for b in Tag.objects.filter(category='genre') + for b in catalogue.models.Tag.objects.filter(category='genre') ] def get_querysets(self): - Tag = apps.get_model('catalogue', 'Tag') - Book = apps.get_model('catalogue', 'Book') - Picture = apps.get_model('picture', 'Picture') - Snippet = apps.get_model('catalogue', 'Snippet') - Collection = apps.get_model('catalogue', 'Collection') qs = 
{ - 'author': Tag.objects.filter(category='author'), - 'theme': Tag.objects.filter(category='theme'), - 'genre': Tag.objects.filter(category='genre'), - 'collection': Collection.objects.all(), - 'book': Book.objects.all(), #findable - 'snippet': Snippet.objects.all(), - 'art': Picture.objects.all(), + 'author': catalogue.models.Tag.objects.filter(category='author'), + 'pdauthor': pdcounter.models.Author.objects.all(), + 'theme': catalogue.models.Tag.objects.filter(category='theme'), + 'genre': catalogue.models.Tag.objects.filter(category='genre'), + 'collection': catalogue.models.Collection.objects.all(), + 'book': catalogue.models.Book.objects.all(), #findable + 'pdbook': pdcounter.models.BookStub.objects.all(), + 'snippet': catalogue.models.Snippet.objects.all(), + 'art': picture.models.Picture.objects.all(), # art pieces - # pdbooks - # pdauthors } if self.cleaned_data['category']: c = self.cleaned_data['category'] - if c != 'author': qs['author'] = Tag.objects.none() - if c != 'theme': qs['theme'] = Tag.objects.none() - if c != 'genre': qs['genre'] = Tag.objects.none() - if c != 'collection': qs['collection'] = Collection.objects.none() - if c != 'book': qs['book'] = Book.objects.none() - if c != 'quote': qs['snippet'] = Snippet.objects.none() - if c != 'art': qs['art'] = Picture.objects.none() + if c != 'author': + qs['author'] = qs['author'].none() + qs['pdauthor'] = qs['pdauthor'].none() + if c != 'theme': qs['theme'] = qs['theme'].none() + if c != 'genre': qs['genre'] = qs['genre'].none() + if c != 'collection': qs['collection'] = qs['collection'].none() + if c != 'book': + qs['book'] = qs['book'].none() + qs['pdbook'] = qs['pdbook'].none() + if c != 'quote': qs['snippet'] = qs['snippet'].none() + if c != 'art': qs['art'] = qs['art'].none() - qs['art'] = Picture.objects.none() + qs['art'] = qs['art'].none() if self.cleaned_data['format']: c = self.cleaned_data['format'] - qs['author'] = Tag.objects.none() - qs['theme'] = Tag.objects.none() - qs['genre'] = Tag.objects.none() - 
qs['collection'] = Collection.objects.none() + qs['author'] = qs['author'].none() + qs['pdauthor'] = qs['pdauthor'].none() + qs['theme'] = qs['theme'].none() + qs['genre'] = qs['genre'].none() + qs['collection'] = qs['collection'].none() if c == 'art': - qs['book'] = Book.objects.none() - qs['snippet'] = Snippet.objects.none() + qs['book'] = qs['book'].none() + qs['pdbook'] = qs['pdbook'].none() + qs['snippet'] = qs['snippet'].none() if c in ('text', 'audio', 'daisy'): - qs['art'] = Picture.objects.none() + qs['art'] = qs['art'].none() if c == 'audio': qs['book'] = qs['book'].filter(media__type='mp3') + qs['pdbook'] = qs['pdbook'].none() qs['snippet'] = qs['snippet'].filter(book__media__type='mp3') elif c == 'daisy': qs['book'] = qs['book'].filter(media__type='daisy') qs['snippet'] = qs['snippet'].filter(book__media__type='daisy') if self.cleaned_data['lang']: - qs['author'] = Tag.objects.none() - qs['theme'] = Tag.objects.none() - qs['genre'] = Tag.objects.none() - qs['art'] = Picture.objects.none() - qs['collection'] = Collection.objects.none() + qs['author'] = qs['author'].none() + qs['pdauthor'] = qs['pdauthor'].none() + qs['theme'] = qs['theme'].none() + qs['genre'] = qs['genre'].none() + qs['art'] = qs['art'].none() + qs['collection'] = qs['collection'].none() qs['book'] = qs['book'].filter(language=self.cleaned_data['lang']) + qs['pdbook'] = qs['pdbook'].none() qs['snippet'] = qs['snippet'].filter(book__language=self.cleaned_data['lang']) for tag_cat in ('epoch', 'genre'): c = self.cleaned_data[tag_cat] if c: # FIXME nonexistent - t = Tag.objects.get(category=tag_cat, slug=c) - qs['author'] = Tag.objects.none() - qs['theme'] = Tag.objects.none() - qs['genre'] = Tag.objects.none() - qs['collection'] = Collection.objects.none() + t = catalogue.models.Tag.objects.get(category=tag_cat, slug=c) + qs['author'] = qs['author'].none() + qs['pdauthor'] = qs['pdauthor'].none() + qs['theme'] = qs['theme'].none() + qs['genre'] = qs['genre'].none() + qs['collection'] = 
qs['collection'].none() qs['book'] = qs['book'].filter(tag_relations__tag=t) + qs['pdbook'] = qs['pdbook'].none() qs['snippet'] = qs['snippet'].filter(book__tag_relations__tag=t) qs['art'] = qs['art'].filter(tag_relations__tag=t) @@ -142,17 +158,14 @@ class SearchFilters(forms.Form): def results(self): qs = self.get_querysets() query = self.cleaned_data['q'] - squery = build_search_query(query, config='polish') + squery = UnaccentSearchQuery(query, config='polish') query = SearchQuery(query, config='polish') - books = qs['book'].filter(title__search=query) + books = qs['book'].annotate( + search_vector=UnaccentSearchVector('title') + ).filter(search_vector=squery) books = books.exclude(ancestor__in=books) - return { - 'author': qs['author'].filter(slug__search=query), - 'theme': qs['theme'].filter(slug__search=query), - 'genre': qs['genre'].filter(slug__search=query), - 'collection': qs['collection'].filter(title__search=query), - 'book': books[:100], - 'snippet': qs['snippet'].annotate( + + snippets = qs['snippet'].annotate( rank=SearchRank('search_vector', squery) ).filter(rank__gt=0).order_by('-rank').annotate( headline=SearchHeadline( @@ -163,7 +176,32 @@ class SearchFilters(forms.Form): stop_sel='', highlight_all=True ) - )[:100], - 'art': qs['art'].filter(title__search=query)[:100], + )[:100] + snippets_by_book = {} + for snippet in snippets: + snippet_list = snippets_by_book.setdefault(snippet.book, []) + if len(snippet_list) < 3: + snippet_list.append(snippet) + + return { + 'author': qs['author'].annotate( + search_vector=UnaccentSearchVector('name_pl') + ).filter(search_vector=squery), + 'theme': qs['theme'].annotate( + search_vector=UnaccentSearchVector('name_pl') + ).filter(search_vector=squery), + 'genre': qs['genre'].annotate( + search_vector=UnaccentSearchVector('name_pl') + ).filter(search_vector=squery), + 'collection': qs['collection'].annotate( + search_vector=UnaccentSearchVector('title') + ).filter(search_vector=squery), + 'book': books[:100], 
+ 'art': qs['art'].annotate( + search_vector=UnaccentSearchVector('title') + ).filter(search_vector=squery)[:100], + 'snippet': snippets_by_book, + 'pdauthor': pdcounter.models.Author.search(squery, qs=qs['pdauthor']), + 'pdbook': pdcounter.models.BookStub.search(squery, qs=qs['pdbook']), } diff --git a/src/search/templates/search/results.html b/src/search/templates/search/results.html index c6ea3836d..189d095e2 100644 --- a/src/search/templates/search/results.html +++ b/src/search/templates/search/results.html @@ -35,6 +35,12 @@ + {% if not hasresults %} +

+ Brak wyników. +

+ {% endif %} + {% if results.author %}

Autorzy

@@ -101,20 +107,22 @@
{% endif %} - {% if results.fragment or results.snippet %} + {% if results.snippet %}

W treści

- {% for f in results.snippet %} + {% for book, snippets in results.snippet.items %}
- {% for author in f.book.authors %} + {% for author in book.authors %} {{ author }} {% endfor %} - - {{ f.book.title }} - - - {{ f.headline|safe }} + + {{ book.title }} + {% for f in snippets %} + + {{ f.headline|safe }} + + {% endfor %}
{% endfor %}
@@ -132,7 +140,7 @@ {% endif %} - {% if pd_authors %} + {% if results.pdauthor or results.pdbook %}

Domena publiczna?

@@ -142,10 +150,32 @@ Dowiedz się, dlaczego biblioteki internetowe nie mogą udostępniać dzieł tego autora.

- {% for tag in pd_authors %} + {% for tag in results.pdauthor %}
{{ tag }} - Dzieła tego autora będą mogły być publikowane bez ograniczeń w roku {{ tag.goes_to_pd }}. + {% if tag.death %} + {% if tag.in_pd %} + Dzieła tego autora są w domenie publicznej i czekają na publikację. + {% else %} + Dzieła tego autora będą mogły być publikowane bez ograniczeń w roku {{ tag.goes_to_pd }}. + {% endif %} + {% else %} + Dzieła tego autora są objęte prawem autorskim. + {% endif %} +
+ {% endfor %} + {% for book in results.pdbook %} +
+ {{ book }} + {% if book.pd %} + {% if book.in_pd %} + Ten utwór jest w domenie publicznej i czeka na publikację. + {% else %} + Ten utwór będzie mógł być publikowany bez ograniczeń w roku {{ book.pd }}. + {% endif %} + {% else %} + Ten utwór nie jest jeszcze w domenie publicznej. + {% endif %}
{% endfor %}
diff --git a/src/search/utils.py b/src/search/utils.py index b2cbe8d94..6c0acf594 100644 --- a/src/search/utils.py +++ b/src/search/utils.py @@ -1,29 +1,23 @@ from django.db.models import Func -from django.contrib.postgres.search import SearchVector, SearchQuery, SearchQueryField, SearchHeadline as SH +from django.contrib.postgres.search import SearchQuery, SearchVectorField +class UnaccentSearchQuery(SearchQuery): + ''' + The idea is to run unaccent *after* the query is already passed through the language dictionary. + ''' + def as_sql(self, *args, **kwargs): + sql, params = super().as_sql(*args, **kwargs) + sql = f'unaccent({sql}::text)::tsquery' + return sql, params -class UnaccentTSVector(Func): - function = 'UNACCENT' - template = '%(function)s(%(expressions)s::text)::tsvector' - -class Unaccent(Func): - function = 'UNACCENT' - - -class ConcatTSVector(Func): - function = 'CONCAT' - template = '%(function)s(%(expressions)s)::tsvector' - - -class UnaccentTSQuery(Func): - function = 'UNACCENT' - template = '%(function)s(%(expressions)s::text)::tsquery' - output_field = SearchQueryField() - - -class TSV(Func): +class UnaccentSearchVector(Func): + ''' + We do the indexing twice, to account for non-diacritic versions. + For example: user enters 'róże' -> stem to 'róża' -> unaccent to 'roza'. + But user enters 'roze' -> stem leaves it as is, so we need original form in the vector. 
+ ''' function='to_tsvector' template = '''unaccent( %(function)s('polish', %(expressions)s)::text)::tsvector || @@ -31,49 +25,4 @@ class TSV(Func): 'polish_simple', unaccent(%(expressions)s) )''' - - -def build_search_vector(*fields): - return TSV(*fields) - - -def build_search_query(*fields, **kwargs): - return UnaccentTSQuery(SearchQuery(*fields, **kwargs)) - - - -class SearchHeadline(SH): - - def __init__( - self, - expression, - query, - *, - config=None, - start_sel=None, - stop_sel=None, - max_words=None, - min_words=None, - short_word=None, - highlight_all=None, - max_fragments=None, - fragment_delimiter=None, - ): - options = { - "StartSel": start_sel, - "StopSel": stop_sel, - "MaxWords": max_words, - "MinWords": min_words, - "ShortWord": short_word, - "HighlightAll": highlight_all, - "MaxFragments": max_fragments, - "FragmentDelimiter": fragment_delimiter, - } - self.options = { - option: value for option, value in options.items() if value is not None - } - expressions = (expression, query) - if config is not None: - config = SearchConfig.from_parameter(config) - expressions = (config,) + expressions - Func.__init__(self, *expressions) + output_field = SearchVectorField() diff --git a/src/search/views.py b/src/search/views.py index 5acbffa85..6ff0f7ab6 100644 --- a/src/search/views.py +++ b/src/search/views.py @@ -11,6 +11,7 @@ from catalogue.models import Book, Tag from pdcounter.models import Author from picture.models import Picture from search.index import Search, SearchResult, PictureResult +from .forms import SearchFilters from suggest.forms import PublishingSuggestForm import re import json @@ -116,8 +117,30 @@ def hint(request, mozhint=False, param='term'): return JsonResponse(data, safe=False) + +@cache.never_cache +def search(request): + filters = SearchFilters(request.GET) + ctx = { + 'title': 'Wynik wyszukiwania', + 'query': filters.data.get('q', ''), + 'filters': filters, + } + if filters.is_valid(): + ctx['results'] = filters.results() + for k, v in 
ctx['results'].items(): + if v: + ctx['hasresults'] = True + break + return render(request, 'search/results.html', ctx) + + @cache.never_cache def main(request): + if request.EXPERIMENTS['search'].value: + request.EXPERIMENTS['layout'].override(True) + return search(request) + query = request.GET.get('q', '') format = request.GET.get('format') diff --git a/src/wolnelektury/settings/custom.py b/src/wolnelektury/settings/custom.py index 07c1bfee8..11e29c6e1 100644 --- a/src/wolnelektury/settings/custom.py +++ b/src/wolnelektury/settings/custom.py @@ -66,7 +66,8 @@ CIVICRM_ACTIVITIES = { 'Failed contribution': 'Nieudana wpłata', } -EXPERIMENTS_LAYOUT = 0 +EXPERIMENTS_LAYOUT = 1 EXPERIMENTS_SOWKA = 0 +EXPERIMENTS_SEARCH = 0 WIDGETS = {} diff --git a/src/wolnelektury/static/2022/styles/components/_search.scss b/src/wolnelektury/static/2022/styles/components/_search.scss index 1063d6b24..1c1369a15 100644 --- a/src/wolnelektury/static/2022/styles/components/_search.scss +++ b/src/wolnelektury/static/2022/styles/components/_search.scss @@ -107,6 +107,7 @@ > div { display: flex; + flex-wrap: wrap; gap: 20px; margin-top: 26px; > div { -- 2.20.1