Search
authorRadek Czajka <rczajka@rczajka.pl>
Mon, 5 Jun 2023 11:01:27 +0000 (13:01 +0200)
committerRadek Czajka <rczajka@rczajka.pl>
Mon, 5 Jun 2023 11:01:27 +0000 (13:01 +0200)
src/catalogue/migrations/0045_snippet_search_vector_idx.py [new file with mode: 0644]
src/catalogue/models/snippet.py
src/experiments/experiments.py
src/pdcounter/models.py
src/search/forms.py
src/search/templates/search/results.html
src/search/utils.py
src/search/views.py
src/wolnelektury/settings/custom.py
src/wolnelektury/static/2022/styles/components/_search.scss

diff --git a/src/catalogue/migrations/0045_snippet_search_vector_idx.py b/src/catalogue/migrations/0045_snippet_search_vector_idx.py
new file mode 100644 (file)
index 0000000..a89fdb7
--- /dev/null
@@ -0,0 +1,19 @@
+# Generated by Django 4.0.8 on 2023-06-02 13:08
+
+import django.contrib.postgres.indexes
+from django.db import migrations
+import django.db.models.expressions
+
+
+class Migration(migrations.Migration):
+    # Adds a GIN index named 'search_vector_idx' on Snippet.search_vector.
+
+    dependencies = [
+        ('catalogue', '0044_snippet'),
+    ]
+
+    operations = [
+        migrations.AddIndex(
+            model_name='snippet',
+            index=django.contrib.postgres.indexes.GinIndex(django.db.models.expressions.F('search_vector'), name='search_vector_idx'),
+        ),
+    ]
index 3c9384c..4c25b8c 100644 (file)
@@ -1,23 +1,27 @@
 from django.db import models
+from django.contrib.postgres.indexes import GinIndex
 from django.contrib.postgres.search import SearchVector, SearchVectorField
-from search.utils import build_search_vector
+from search.utils import UnaccentSearchVector
 
 
 class Snippet(models.Model):
     book = models.ForeignKey('Book', models.CASCADE)
     sec = models.IntegerField()
-    # header_type ?
-    # header_span ?
     text = models.TextField()
     search_vector = SearchVectorField()
 
+    class Meta:
+        indexes = [
+            GinIndex('search_vector', name='search_vector_idx'),
+        ]
+            
     def save(self, *args, **kwargs):
+        # Persist the row first; if search_vector is still empty, fill it via update().
         super().save(*args, **kwargs)
         if not self.search_vector:
             self.update()
 
     def update(self):
+        # Assign the vector as an expression over the 'text' column, then save again.
-        self.search_vector = build_search_vector('text')
+        self.search_vector = UnaccentSearchVector('text')
         self.save()
 
     @classmethod
index 15c103f..60d0521 100644 (file)
@@ -7,7 +7,7 @@ from .base import Experiment
 class NewLayout(Experiment):
     slug = 'layout'
     name = 'Nowy layout strony'
-    size = 1 or settings.EXPERIMENTS_LAYOUT
+    size = settings.EXPERIMENTS_LAYOUT
 
     def qualify(self, request):
         if get_language() != 'pl':
@@ -21,7 +21,14 @@ class Sowka(Experiment):
     switchable = False
 
 
+class Search(Experiment):
+    # Experiment for the new search; its size is read from settings.EXPERIMENTS_SEARCH.
+    slug = 'search'
+    name = 'Nowa wyszukiwarka'
+    size = settings.EXPERIMENTS_SEARCH
+
+
 experiments = [
     NewLayout,
     Sowka,
+    Search
 ]
index 3ce7f72..2e1e0b9 100644 (file)
@@ -1,12 +1,14 @@
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
+from django.apps import apps
 from django.conf import settings
 from django.db import models
 from django.urls import reverse
 from django.utils.translation import gettext_lazy as _
 from datetime import datetime
 from django.db.models.signals import post_save, post_delete
+from search.utils import UnaccentSearchVector
 
 
 class Author(models.Model):
@@ -41,6 +43,18 @@ class Author(models.Model):
     has_description.short_description = _('description')
     has_description.boolean = True
 
+    @classmethod
+    def search(cls, query, qs=None):
+        """Return authors matching `query` whose slug has no catalogue author tag."""
+        base = cls.objects.all() if qs is None else qs
+        matched = base.annotate(
+            search_vector=UnaccentSearchVector('name')
+        ).filter(search_vector=query)
+        # Matching slugs are materialised, then looked up among catalogue Tag rows.
+        slugs = list(matched.values_list('slug', flat=True))
+        tags = apps.get_model('catalogue', 'Tag').objects.filter(category='author', slug__in=slugs)
+        return matched.exclude(slug__in=tags.values_list('slug', flat=True))
+
     def alive(self):
         return self.death is None
 
@@ -72,6 +86,18 @@ class BookStub(models.Model):
     def __str__(self):
         return self.title
 
+    @classmethod
+    def search(cls, query, qs=None):
+        """Return stubs whose title matches `query` and whose slug no Book uses."""
+        base = cls.objects.all() if qs is None else qs
+        matched = base.annotate(
+            search_vector=UnaccentSearchVector('title')
+        ).filter(search_vector=query)
+        # Matching slugs are materialised, then looked up among catalogue Book rows.
+        slugs = list(matched.values_list('slug', flat=True))
+        books = apps.get_model('catalogue', 'Book').objects.filter(slug__in=slugs)
+        return matched.exclude(slug__in=books.values_list('slug', flat=True))
+
     def get_absolute_url(self):
         return reverse('book_detail', args=[self.slug])
 
index 7efd747..f3bf0c0 100644 (file)
@@ -2,12 +2,16 @@
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 from django.apps import apps
+from django.conf import settings
 from django.contrib.postgres.search import SearchHeadline, SearchRank, SearchQuery
 from django import forms
 from django.utils.translation import gettext_lazy as _
-
+from catalogue.constants import LANGUAGES_3TO2
+import catalogue.models
+import pdcounter.models
+import picture.models
 from .fields import JQueryAutoCompleteSearchField, InlineRadioWidget
-from .utils import build_search_query
+from .utils import UnaccentSearchQuery, UnaccentSearchVector
 
 
 class SearchForm(forms.Form):
@@ -25,7 +29,10 @@ class SearchForm(forms.Form):
 
 
 class SearchFilters(forms.Form):
-    q = forms.CharField(required=False, widget=forms.HiddenInput())
+    q = forms.CharField(
+        required=False, widget=forms.HiddenInput(),
+        min_length=2, max_length=256,
+    )
     format = forms.ChoiceField(required=False, choices=[
         ('', 'wszystkie'),
         ('text', 'tekst'),
@@ -51,89 +58,98 @@ class SearchFilters(forms.Form):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        from catalogue.models import Book, Tag
 
+        # Choices for lang/epoch/genre are rebuilt from the database on every instantiation.
+        langs = dict(settings.LANGUAGES)
         self.fields['lang'].choices = [('', 'wszystkie')] + [
-            (b, b)
-            for b in Book.objects.values_list(
+            (
+                b,
+                # Label from settings.LANGUAGES, looked up through LANGUAGES_3TO2
+                # (presumably 3-letter -> 2-letter codes; verify); falls back to the raw code.
+                langs.get(LANGUAGES_3TO2.get(b, b), b)
+            )
+            for b in catalogue.models.Book.objects.values_list(
                     'language', flat=True
             ).distinct().order_by()
         ]
         self.fields['epoch'].choices = [('', 'wszystkie')] + [
             (b.slug, b.name)
-            for b in Tag.objects.filter(category='epoch')
+            for b in catalogue.models.Tag.objects.filter(category='epoch')
         ]
         self.fields['genre'].choices = [('', 'wszystkie')] + [
             (b.slug, b.name)
-            for b in Tag.objects.filter(category='genre')
+            for b in catalogue.models.Tag.objects.filter(category='genre')
         ]
 
     def get_querysets(self):
-        Tag = apps.get_model('catalogue', 'Tag')
-        Book = apps.get_model('catalogue', 'Book')
-        Picture = apps.get_model('picture', 'Picture')
-        Snippet = apps.get_model('catalogue', 'Snippet')
-        Collection = apps.get_model('catalogue', 'Collection')
         qs = {
-            'author': Tag.objects.filter(category='author'),
-            'theme': Tag.objects.filter(category='theme'),
-            'genre': Tag.objects.filter(category='genre'),
-            'collection': Collection.objects.all(),
-            'book': Book.objects.all(), #findable
-            'snippet': Snippet.objects.all(),
-            'art': Picture.objects.all(),
+            'author': catalogue.models.Tag.objects.filter(category='author'),
+            'pdauthor': pdcounter.models.Author.objects.all(),
+            'theme': catalogue.models.Tag.objects.filter(category='theme'),
+            'genre': catalogue.models.Tag.objects.filter(category='genre'),
+            'collection': catalogue.models.Collection.objects.all(),
+            'book': catalogue.models.Book.objects.all(), #findable
+            'pdbook': pdcounter.models.BookStub.objects.all(),
+            'snippet': catalogue.models.Snippet.objects.all(),
+            'art': picture.models.Picture.objects.all(),
             # art pieces
-            # pdbooks
-            # pdauthors
         }
         if self.cleaned_data['category']:
             c = self.cleaned_data['category']
-            if c != 'author': qs['author'] = Tag.objects.none()
-            if c != 'theme': qs['theme'] = Tag.objects.none()
-            if c != 'genre': qs['genre'] = Tag.objects.none()
-            if c != 'collection': qs['collection'] = Collection.objects.none()
-            if c != 'book': qs['book'] = Book.objects.none()
-            if c != 'quote': qs['snippet'] = Snippet.objects.none()
-            if c != 'art': qs['art'] = Picture.objects.none()
+            if c != 'author':
+                qs['author'] = qs['author'].none()
+                qs['pdauthor'] = qs['pdauthor'].none()
+            if c != 'theme': qs['theme'] = qs['theme'].none()
+            if c != 'genre': qs['genre'] = qs['genre'].none()
+            if c != 'collection': qs['collection'] = qs['collection'].none()
+            if c != 'book':
+                qs['book'] = qs['book'].none()
+                qs['pdbook'] = qs['pdbook'].none()
+            if c != 'quote': qs['snippet'] = qs['snippet'].none()
+            if c != 'art': qs['art'] = qs['art'].none()
             qs['art'] = Picture.objects.none()
 
         if self.cleaned_data['format']:
             c = self.cleaned_data['format']
-            qs['author'] = Tag.objects.none()
-            qs['theme'] = Tag.objects.none()
-            qs['genre'] = Tag.objects.none()
-            qs['collection'] = Collection.objects.none()
+            qs['author'] = qs['author'].none()
+            qs['pdauthor'] = qs['pdauthor'].none()
+            qs['theme'] = qs['theme'].none()
+            qs['genre'] = qs['genrer'].none()
+            qs['collection'] = qs['collection'].none()
             if c == 'art':
-                qs['book'] = Book.objects.none()
-                qs['snippet'] = Snippet.objects.none()
+                qs['book'] = qs['book'].none()
+                qs['pdbook'] = qs['pdbook'].none()
+                qs['snippet'] = qs['snippet'].none()
             if c in ('text', 'audio', 'daisy'):
-                qs['art'] = Picture.objects.none()
+                qs['art'] = qs['art'].none()
                 if c == 'audio':
                     qs['book'] = qs['book'].filter(media__type='mp3')
+                    qs['pdbook'] = qs['book'].none()
                     qs['snippet'] = qs['snippet'].filter(book__media__type='mp3')
                 elif c == 'daisy':
                     qs['book'] = qs['book'].filter(media__type='daisy')
                     qs['snippet'] = qs['snippet'].filter(book__media__type='daisy')
 
         if self.cleaned_data['lang']:
-            qs['author'] = Tag.objects.none()
-            qs['theme'] = Tag.objects.none()
-            qs['genre'] = Tag.objects.none()
-            qs['art'] = Picture.objects.none()
-            qs['collection'] = Collection.objects.none()
+            qs['author'] = qs['author'].none()
+            qs['pdauthor'] = qs['pdauthor'].none()
+            qs['theme'] = qs['theme'].none()
+            qs['genre'] = qs['genre'].none()
+            qs['art'] = qs['art'].none()
+            qs['collection'] = qs['collection'].none()
             qs['book'] = qs['book'].filter(language=self.cleaned_data['lang'])
+            qs['pdbook'] = qs['pdbook'].none()
             qs['snippet'] = qs['snippet'].filter(book__language=self.cleaned_data['lang'])
 
         for tag_cat in ('epoch', 'genre'):
             c = self.cleaned_data[tag_cat]
             if c:
                 # FIXME nonexistent
-                t = Tag.objects.get(category=tag_cat, slug=c)
-                qs['author'] = Tag.objects.none()
-                qs['theme'] = Tag.objects.none()
-                qs['genre'] = Tag.objects.none()
-                qs['collection'] = Collection.objects.none()
+                t = catalogue.models.Tag.objects.get(category=tag_cat, slug=c)
+                qs['author'] = qs['author'].none()
+                qs['pdauthor'] = qs['pdauthor'].none()
+                qs['theme'] = qs['theme'].none()
+                qs['genre'] = qs['genre'].none()
+                qs['collection'] = qs['collection'].none()
                 qs['book'] = qs['book'].filter(tag_relations__tag=t)
+                qs['pdbook'] = qs['pdbook'].none()
                 qs['snippet'] = qs['snippet'].filter(book__tag_relations__tag=t)
                 qs['art'] = qs['art'].filter(tag_relations__tag=t)
             
@@ -142,17 +158,14 @@ class SearchFilters(forms.Form):
     def results(self):
         qs = self.get_querysets()
         query = self.cleaned_data['q']
-        squery = build_search_query(query, config='polish')
+        squery = UnaccentSearchQuery(query, config='polish')
         query = SearchQuery(query, config='polish')
-        books = qs['book'].filter(title__search=query)
+        books = qs['book'].annotate(
+            search_vector=UnaccentSearchVector('title')
+        ).filter(search_vector=squery)
         books = books.exclude(ancestor__in=books)
-        return {
-            'author': qs['author'].filter(slug__search=query),
-            'theme': qs['theme'].filter(slug__search=query),
-            'genre': qs['genre'].filter(slug__search=query),
-            'collection': qs['collection'].filter(title__search=query),
-            'book': books[:100],
-            'snippet': qs['snippet'].annotate(
+
+        snippets = qs['snippet'].annotate(
                     rank=SearchRank('search_vector', squery)
                 ).filter(rank__gt=0).order_by('-rank').annotate(
                     headline=SearchHeadline(
@@ -163,7 +176,32 @@ class SearchFilters(forms.Form):
                         stop_sel='</strong>',
                         highlight_all=True
                     )
-                )[:100],
-            'art': qs['art'].filter(title__search=query)[:100],
+                )[:100]
+        snippets_by_book = {}
+        for snippet in snippets:
+            snippet_list = snippets_by_book.setdefault(snippet.book, [])
+            if len(snippet_list) < 3:
+                snippet_list.append(snippet)
+
+        return {
+            'author': qs['author'].annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=squery),
+            'theme': qs['theme'].annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=squery),
+            'genre': qs['genre'].annotate(
+                search_vector=UnaccentSearchVector('name_pl')
+            ).filter(search_vector=squery),
+            'collection': qs['collection'].annotate(
+                search_vector=UnaccentSearchVector('title')
+            ).filter(search_vector=squery),
+            'book': books[:100],
+            'art': qs['art'].annotate(
+                search_vector=UnaccentSearchVector('title')
+            ).filter(search_vector=squery)[:100],
+            'snippet': snippets_by_book,
+            'pdauthor': pdcounter.models.Author.search(squery, qs=qs['pdauthor']),
+            'pdbook': pdcounter.models.BookStub.search(squery, qs=qs['pdbook']),
         }
 
index c6ea383..189d095 100644 (file)
       <button type="submit" class="c-form__hidden-submit">wyślij</button>
     </form>
 
+    {% if not hasresults %}
+      <p class="l-change-pop show">
+        Brak wyników.
+      </p>
+    {% endif %}
+
     {% if results.author %}
     <div class="l-container">
       <h2 class="header">Autorzy</h2>
       </div>
     {% endif %}
 
-    {% if results.fragment or results.snippet %}
+    {% if results.snippet %}
       <div class="l-container">
         <h2 class="header">W treści</h2>
-        {% for f in results.snippet %}
+        {% for book, snippets in results.snippet.items %}
           <div class="c-search-result-fragment">
-            {% for author in f.book.authors %}
+            {% for author in book.authors %}
               <a class="c-search-result-fragment-author" href="{{ author.get_absolute_url }}">{{ author }}</a>
             {% endfor %}
-            <a class="c-search-result-fragment-title" href="{{ f.book.get_absolute_url }}">
-              {{ f.book.title }}
-            </a>
-            <a class="c-search-result-fragment-text" href='{% url 'book_text' f.book.slug %}#sec{{ f.sec }}'>
-              {{ f.headline|safe }}
+            <a class="c-search-result-fragment-title" href="{{ book.get_absolute_url }}">
+              {{ book.title }}
             </a>
+            {% for f in snippets %}
+              <a class="c-search-result-fragment-text" href='{% url 'book_text' f.book.slug %}#sec{{ f.sec }}'>
+                {{ f.headline|safe }}
+              </a>
+            {% endfor %}
           </div>
         {% endfor %}
       </div>
       </div>
     {% endif %}
 
-    {% if pd_authors %}
+    {% if results.pdauthor or results.pdbook %}
       <div class="l-container">
         <div class="c-search-result-pd">
           <h2>Domena publiczna?</h2>
             Dowiedz się, dlaczego biblioteki internetowe nie mogą udostępniać dzieł tego autora.
           </p>
           <div>
-            {% for tag in pd_authors %}
+            {% for tag in results.pdauthor %}
               <div><a href="{{ tag.get_absolute_url }}">
                 <strong>{{ tag }}</strong>
-                Dzieła tego autora będą mogły być publikowane bez ograniczeń w&nbsp;roku&nbsp;<em>{{ tag.goes_to_pd }}</em>.
+                {% if tag.death  %}
+                  {% if tag.in_pd %}
+                    Dzieła tego autora są w&nbsp;domenie publicznej i&nbsp;czekają na publikację.
+                  {% else %}
+                    Dzieła tego autora będą mogły być publikowane bez ograniczeń w&nbsp;roku&nbsp;<em>{{ tag.goes_to_pd }}</em>.
+                  {% endif %}
+                {% else %}
+                  Dzieła tego autora są objęte prawem autorskim.
+                {% endif %}
+              </a></div>
+            {% endfor %}
+            {% for book in results.pdbook %}
+              <div><a href="{{ book.get_absolute_url }}">
+                <strong>{{ book }}</strong>
+                {% if book.pd  %}
+                  {% if book.in_pd %}
+                    Ten utwór jest w&nbsp;domenie publicznej i&nbsp;czeka na publikację.
+                  {% else %}
+                    Ten utwór będzie mógł być publikowany bez ograniczeń w&nbsp;roku&nbsp;<em>{{ book.pd }}</em>.
+                  {% endif %}
+                {% else %}
+                  Ten utwór nie jest jeszcze w domenie publicznej.
+                {% endif %}
               </a></div>
             {% endfor %}
           </div>
index b2cbe8d..6c0acf5 100644 (file)
@@ -1,29 +1,23 @@
 from django.db.models import Func
-from django.contrib.postgres.search import SearchVector, SearchQuery, SearchQueryField, SearchHeadline as SH
+from django.contrib.postgres.search import SearchQuery, SearchVectorField
 
 
+class UnaccentSearchQuery(SearchQuery):
+    '''
+    SearchQuery that is unaccented *after* the language dictionary has processed it:
+    the generated tsquery is cast to text, passed through unaccent() and cast back.
+    '''
+    def as_sql(self, *args, **kwargs):
+        inner_sql, params = super().as_sql(*args, **kwargs)
+        return f'unaccent({inner_sql}::text)::tsquery', params
 
-class UnaccentTSVector(Func):
-    function = 'UNACCENT'
-    template = '%(function)s(%(expressions)s::text)::tsvector'
 
-
-class Unaccent(Func):
-    function = 'UNACCENT'
-
-    
-class ConcatTSVector(Func):
-    function = 'CONCAT'
-    template = '%(function)s(%(expressions)s)::tsvector'    
-
-
-class UnaccentTSQuery(Func):
-    function = 'UNACCENT'
-    template = '%(function)s(%(expressions)s::text)::tsquery'
-    output_field = SearchQueryField()
-
-
-class TSV(Func):
+class UnaccentSearchVector(Func):
+    '''
+    We do the indexing twice, to account for non-diacritic versions.
+    For example: user enters 'róże' -> stem to 'róża' -> unaccent to 'roza'.
+    But user enters 'roze' -> stem leaves it as is, so we need original form in the vector.
+    '''
     function='to_tsvector'
     template = '''unaccent(
       %(function)s('polish', %(expressions)s)::text)::tsvector ||
@@ -31,49 +25,4 @@ class TSV(Func):
        'polish_simple', 
        unaccent(%(expressions)s)
      )'''
-
-
-def build_search_vector(*fields):
-    return TSV(*fields)
-
-
-def build_search_query(*fields, **kwargs):
-    return UnaccentTSQuery(SearchQuery(*fields, **kwargs))
-
-
-
-class SearchHeadline(SH):
-
-    def __init__(
-        self,
-        expression,
-        query,
-        *,
-        config=None,
-        start_sel=None,
-        stop_sel=None,
-        max_words=None,
-        min_words=None,
-        short_word=None,
-        highlight_all=None,
-        max_fragments=None,
-        fragment_delimiter=None,
-    ):
-        options = {
-            "StartSel": start_sel,
-            "StopSel": stop_sel,
-            "MaxWords": max_words,
-            "MinWords": min_words,
-            "ShortWord": short_word,
-            "HighlightAll": highlight_all,
-            "MaxFragments": max_fragments,
-            "FragmentDelimiter": fragment_delimiter,
-        }
-        self.options = {
-            option: value for option, value in options.items() if value is not None
-        }
-        expressions = (expression, query)
-        if config is not None:
-            config = SearchConfig.from_parameter(config)
-            expressions = (config,) + expressions
-        Func.__init__(self, *expressions)
+    output_field = SearchVectorField()
index 5acbffa..6ff0f7a 100644 (file)
@@ -11,6 +11,7 @@ from catalogue.models import Book, Tag
 from pdcounter.models import Author
 from picture.models import Picture
 from search.index import Search, SearchResult, PictureResult
+from .forms import SearchFilters
 from suggest.forms import PublishingSuggestForm
 import re
 import json
@@ -116,8 +117,30 @@ def hint(request, mozhint=False, param='term'):
         return JsonResponse(data, safe=False)
 
 
+
+@cache.never_cache
+def search(request):
+    """Render the search results page for the filters given in the query string.
+
+    The context always carries 'title', 'query' and 'filters'; 'results' is
+    added when the filters validate, and 'hasresults' only when at least one
+    result set is non-empty.
+    """
+    filters = SearchFilters(request.GET)
+    ctx = {
+        'title': 'Wynik wyszukiwania',
+        # .get(): a request without `q` must not raise MultiValueDictKeyError.
+        'query': filters.data.get('q', ''),
+        'filters': filters,
+    }
+    if filters.is_valid():
+        ctx['results'] = filters.results()
+        # Truth-testing evaluates each result set; any() stops at the first hit.
+        if any(ctx['results'].values()):
+            ctx['hasresults'] = True
+    return render(request, 'search/results.html', ctx)
+
+
 @cache.never_cache
 def main(request):
+    if request.EXPERIMENTS['search'].value:
+        request.EXPERIMENTS['layout'].override(True)
+        return search(request)
+
     query = request.GET.get('q', '')
 
     format = request.GET.get('format')
index 07c1bfe..11e29c6 100644 (file)
@@ -66,7 +66,8 @@ CIVICRM_ACTIVITIES = {
     'Failed contribution': 'Nieudana wpłata',
 }
 
-EXPERIMENTS_LAYOUT = 0
+EXPERIMENTS_LAYOUT = 1
 EXPERIMENTS_SOWKA = 0
+EXPERIMENTS_SEARCH = 0
 
 WIDGETS = {}
index 1063d6b..1c1369a 100644 (file)
     
     > div {
         display: flex;
+        flex-wrap: wrap;
         gap: 20px;
         margin-top: 26px;
         > div {