pip-log.txt
nosetests.xml
/htmlcov
+.python-version
# Mac OS X garbage
.DS_Store
#pyoai==2.5.1
-e git+https://github.com/infrae/pyoai@5ff2f15e869869e70d8139e4c37b7832854d7049#egg=pyoai
-scorched==0.13
-httplib2
-
sentry-sdk==0.10.2
requests
{
"url": "http://testserver/katalog/lektury/a-collection/",
+ "authors": [],
"books": [
{
"kind": "Liryka",
from catalogue.models import Book
from picture.models import Picture
-from search.index import Index
class Command(BaseCommand):
parser.add_argument(
'-D', '--dont-build', dest='dont_build', metavar="FORMAT,...",
help="Skip building specified formats")
- parser.add_argument(
- '-S', '--no-search-index', action='store_false',
- dest='search_index', default=True,
- help='Skip indexing imported works for search')
parser.add_argument(
'-F', '--not-findable', action='store_false',
dest='findable', default=True,
file_base, ext = os.path.splitext(file_path)
book = Book.from_xml_file(file_path, overwrite=options.get('force'),
dont_build=dont_build,
- search_index_tags=False,
findable=options.get('findable'),
remote_gallery_url='file://' + os.path.dirname(os.path.abspath(file_base)) + '/img/'
)
verbose = options.get('verbose')
import_picture = options.get('import_picture')
- if options.get('search_index') and not settings.NO_SEARCH_INDEX:
- index = Index()
- try:
- index.index_tags()
- index.index.commit()
- except Exception as e:
- index.index.rollback()
- raise e
-
files_imported = 0
files_skipped = 0
})
return create_zip(paths, "%s_%s" % (self.slug, format_), {'informacje.txt': readme})
- def search_index(self, book_info=None, index=None, index_tags=True, commit=True):
+ def search_index(self, index=None):
if not self.findable:
return
- if index is None:
- from search.index import Index
- index = Index()
- try:
- index.index_book(self, book_info)
- if index_tags:
- index.index_tags()
- if commit:
- index.index.commit()
- except Exception as e:
- index.index.rollback()
- raise e
+ from search.index import Index
+ Index.index_book(self)
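
With Solr gone, search_index() becomes a thin wrapper: Index.index_book parses the book's XML and stores plain-text snippets in the database. A minimal usage sketch under this diff's assumptions (a saved, findable Book with an xml_file attached; the slug is hypothetical):

    from catalogue.models import Book
    from search.index import Index

    book = Book.objects.get(slug='default-book')  # hypothetical slug
    book.search_index()      # no-op unless book.findable is True
    # ...equivalent to calling the classmethod directly:
    Index.index_book(book)   # rebuilds book.snippet_set from the XML
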
# will cause problems in conjunction with paid previews
def download_pictures(self, remote_gallery_url):
@classmethod
def from_text_and_meta(cls, raw_file, book_info, overwrite=False, dont_build=None, search_index=True,
- search_index_tags=True, remote_gallery_url=None, days=0, findable=True):
+ remote_gallery_url=None, days=0, findable=True):
from catalogue import tasks
if dont_build is None:
getattr(book, '%s_file' % format_).build_delay()
if not settings.NO_SEARCH_INDEX and search_index and findable:
- tasks.index_book.delay(book.id, book_info=book_info, index_tags=search_index_tags)
+ tasks.index_book.delay(book.id)
for child in notify_cover_changed:
child.parent_cover_changed()
def book_delete(sender, instance, **kwargs):
caches[settings.CACHE_MIDDLEWARE_ALIAS].clear()
- if not settings.NO_SEARCH_INDEX:
- # remove the book from search index, when it is deleted.
- from search.index import Index
- idx = Index()
- idx.remove_book(instance)
- idx.index_tags()
-
####
# Tag
@shared_task
-def index_book(book_id, book_info=None, **kwargs):
+def index_book(book_id, **kwargs):
try:
- return Book.objects.get(id=book_id).search_index(book_info, **kwargs)
+ return Book.objects.get(id=book_id).search_index(**kwargs)
except Exception as e:
print("Exception during index: %s" % e)
print_exc()
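
The task now carries only the book id; everything else is re-read from the book's XML at index time. A sketch of both invocation styles, assuming a configured Celery worker (the id is hypothetical):

    from catalogue.tasks import index_book

    index_book.delay(42)   # asynchronous, as from_text_and_meta does it
    index_book(42)         # or inline, e.g. in tests and shells
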
+++ /dev/null
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load pagination_tags %}
-{% load inline_tag_list from catalogue_tags %}
-{% load book_searched from search_tags %}
-{% load set_get_parameter %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
-
- <h1>{% trans "Search" %}</h1>
-
- <div class="white-box">
-
- <p class="search-filter">
- <strong>format:</strong>
- {% if not set.format %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter format='' %}">dowolny</a>{% endif %}
- {% if set.format == "tekst" %}<em>tekst</em>{% else %}<a href="{% set_get_parameter format='tekst' %}">tekst</a>{% endif %}
- {% if set.format == "audio" %}<em>audiobook</em>{% else %}<a href="{% set_get_parameter format='audio' %}">audiobook</a>{% endif %}
- {% if set.format == "synchro" %}<em>DAISY</em>{% else %}<a href="{% set_get_parameter format='synchro' %}">DAISY</a>{% endif %}
- {% if set.format == "obraz" %}<em>obraz</em>{% else %}<a href="{% set_get_parameter format='obraz' %}">obraz</a>{% endif %}
- </p>
-
- <p class="search-filter">
- <strong>{% trans "language" %}: </strong>
- {% if not set.lang %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter lang='' %}">dowolny</a>{% endif %}
- {% if set.lang == "pol" %}<em>polski</em>{% else %}<a href="{% set_get_parameter lang='pol' %}">polski</a>{% endif %}
- {% if set.lang == "eng" %}<em>angielski</em>{% else %}<a href="{% set_get_parameter lang='eng' %}">angielski</a>{% endif %}
- {% if set.lang == "fre" %}<em>francuski</em>{% else %}<a href="{% set_get_parameter lang='fre' %}">francuski</a>{% endif %}
- {% if set.lang == "ger" %}<em>niemiecki</em>{% else %}<a href="{% set_get_parameter lang='ger' %}">niemiecki</a>{% endif %}
- {% if set.lang == "lit" %}<em>litewski</em>{% else %}<a href="{% set_get_parameter lang='lit' %}">litewski</a>{% endif %}
- </p>
-
- </p>
- <p class="search-filter">
- <strong>{% trans "epoch" %}: </strong>
- {% if not set.epoch %}<em>dowolna</em>{% else %}<a href="{% set_get_parameter epoch='' %}">dowolna</a>{% endif %}
-
- {% for tag in tags.epoch %}
- {% if set.epoch == tag.slug %}
- <em>{{ tag.name }}</em>
- {% else %}
- <a href="{% set_get_parameter epoch=tag.slug %}">
- {{ tag.name }}
- </a>
- {% endif %}
- {% endfor %}
- </p>
- <p class="search-filter">
- <strong>{% trans "kind" %}: </strong>
- {% if not set.kind %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter kind='' %}">dowolny</a>{% endif %}
- {% for tag in tags.kind %}
- {% if set.kind == tag.slug %}
- <em>{{ tag.name }}</em>
- {% else %}
- <a href="{% set_get_parameter kind=tag.slug %}">
- {{ tag.name }}
- </a>
- {% endif %}
- {% endfor %}
- </p>
-
- {% comment %}
- <p class="search-filter">
- <strong>{% trans "genre" %}: </strong>
- {% if not set.genre %}<em>dowolny</em>{% else %}<a href="{% set_get_parameter genre='' %}">dowolny</a>{% endif %}
- {% for tag in tags.genre %}
- {% if set.genre == tag.slug %}
- <em>{{ tag.name }}</em>
- {% else %}
- <a href="{% set_get_parameter genre=tag.slug %}">
- {{ tag.name }}
- </a>
- {% endif %}
- {% endfor %}
- </p>
- {% endcomment %}
- </div>
-
- {% if did_you_mean %}
- <span class="did_you_mean">{% trans "Did you mean" %}
- <a href="{% url 'search' %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
- {% endif %}
- <div class="top-tag-list">
- {% if pd_authors %}
- <div>
- {% for author in pd_authors %}
- <div class="tag-box">
- {% include "pdcounter/pd_author_box.html" %}
- </div>
- {% endfor %}
- </div>
- {% endif %}
- </div>
-
- <div>
- {% if books %}
- <ul class="work-list">
- {% if pictures %}
- <h1>{% trans "Books" %}</h1>
- {% endif %}
- {% for result in books %}
- <li class="Book-item">
- <div class="search-result">
- {% book_searched result %}
- </div>
- </li>
- {% endfor %}
- </ul>
- {% endif %}
-
- {% if pictures %}
- <h1>{% trans "Art" %}</h1>
- <ul class="work-list">
- {% for result in pictures %}
- <li class="Picture-item">
- <div class="search-result">
- {% with result.picture as picture %}
- {% include "picture/picture_searched.html" %}
- {% endwith %}
- </div>
- </li>
- {% endfor %}
- </ul>
- {% endif %}
- </div>
-{% endblock %}
+++ /dev/null
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
- <h1>{% trans "Search" %}</h1>
-
- <div class="left-column">
- <div class="normal-text">
- <p>
- {% if did_you_mean %}
- <span class="did_you_mean">{% trans "Did you mean" %}
- <a href="{% url 'search' %}?q={{did_you_mean|urlencode}}">{{did_you_mean|lower}}</a>?</span>
- {% endif %}
- </p>
- <p>{% trans "Sorry! Search cirteria did not match any resources." %}</p>
-
- {% include "info/join_us.html" %}
- </div>
- </div>
-
- <div class="right-column">
- {% include "publishing_suggest.html" %}
- </div>
-{% endblock %}
+++ /dev/null
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
- <h1>{% trans "Search" %}</h1>
-
- <div id="books-list">
- <p>{% trans "Sorry! Search query is too long to be processed." %}</p>
- {% include "info/join_us.html" %}
- </div>
-{% endblock %}
\ No newline at end of file
+++ /dev/null
-{% extends "base/base.html" %}
-{% load i18n %}
-{% load catalogue_tags pagination_tags %}
-
-{% block titleextra %}{% trans "Search" %}{% endblock %}
-
-{% block bodyid %}tagged-object-list{% endblock %}
-
-{% block body %}
- <h1>{% trans "Search" %}</h1>
-
- <div id="books-list">
- <p>{% trans "Sorry! Search query must have at least two characters." %}</p>
- {% include "info/join_us.html" %}
- </div>
-{% endblock %}
\ No newline at end of file
CACHES={
'default': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
},
- SOLR=settings.SOLR_TEST,
)
class WLTestCase(TestCase):
"""
language = 'pol'
return {
'title': str(title),
- 'url': WLURI.from_slug(slug),
+ 'url': WLURI(slug),
'about': "http://wolnelektury.pl/example/URI/%s" % slug,
'language': language,
}
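
The fixtures switch from WLURI.from_slug(slug) to constructing WLURI directly from a slug; parsing a full URI string moves to WLURI.from_text (used in the OPDS tests below). A sketch of the librarian API assumed by these tests:

    from librarian import WLURI

    uri = WLURI('default-book')   # the constructor now takes a slug
    text = 'http://wolnelektury.pl/katalog/lektura/default-book/'  # hypothetical URI
    slug = WLURI.from_text(text).slug   # parsing a URI string lives here
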
def setUp(self):
WLTestCase.setUp(self)
self.book_info = BookInfoStub(
- url=WLURI.from_slug("default-book"),
+ url=WLURI("default-book"),
about="http://wolnelektury.pl/example/URI/default_book",
title="Default Book",
author=PersonStub(("Jim",), "Lazy"),
def test_book_with_invalid_slug(self):
""" Book with invalid characters in slug shouldn't be imported """
- self.book_info.url = WLURI.from_slug("default_book")
+ self.book_info.url = WLURI("default_book")
book_text = "<utwor />"
with self.assertRaises(ValueError):
models.Book.from_text_and_meta(ContentFile(book_text), self.book_info)
class MultilingualBookImportTest(WLTestCase):
def setUp(self):
WLTestCase.setUp(self)
- common_uri = WLURI.from_slug('common-slug')
+ common_uri = WLURI('common-slug')
self.pol_info = BookInfoStub(
genre='X-Genre',
-from os.path import basename, exists
+from os.path import basename, dirname, exists, join
from unittest import skip
-from django.core.files.base import ContentFile
+from django.core.files.base import ContentFile, File
from catalogue.test_utils import *
from catalogue import models, utils
self.file = ContentFile(b'X')
self.file2 = ContentFile(b'Y')
self.book = models.Book.objects.create(slug='test-book', title='Test')
+ with open(join(dirname(__file__), "files/fraszka-do-anusie.xml")) as f:
+ self.book.xml_file.save(None, File(f))
def set_title(self, title):
self.book.title = title
def test_empty(self):
""" empty tag should have no related tags """
- cats = self.client.get('/katalog/autor/empty/').context['categories']
- self.assertEqual({k: v for (k, v) in cats.items() if v}, {}, 'tags related to empty tag')
+ suggested = self.client.get('/katalog/autor/empty/').context['suggested_tags']
+ self.assertEqual(suggested, [], 'tags related to empty tag')
def test_has_related(self):
""" related own and descendants' tags should be generated """
- cats = self.client.get('/katalog/rodzaj/kind/').context['categories']
- self.assertTrue('Common Man' in [tag.name for tag in cats['author']],
+ suggested = {
+ (t.name, t.category)
+ for t in self.client.get('/katalog/rodzaj/kind/').context['suggested_tags']
+ }
+ self.assertTrue(('Common Man', 'author') in suggested,
'missing `author` related tag')
- self.assertTrue('Epoch' in [tag.name for tag in cats['epoch']],
+ self.assertTrue(('Epoch', 'epoch') in suggested,
'missing `epoch` related tag')
- self.assertFalse(cats.get("kind", False),
+ # TODO: this should probably be changed now.
+ self.assertFalse(any(x for x in suggested if x[1] == "kind"),
"There should be no child-only related `kind` tags")
- self.assertTrue("Genre" in [tag.name for tag in cats['genre']],
+ self.assertTrue(("Genre", 'genre') in suggested,
'missing `genre` related tag')
- self.assertFalse("ChildGenre" in [tag.name for tag in cats['genre']],
+ # TODO: this should probably be changed now.
+ self.assertFalse(("ChildGenre", 'genre') in suggested,
"There should be no child-only related `genre` tags")
- self.assertTrue("GchildGenre" in [tag.name for tag in cats['genre']],
+ self.assertTrue(("GchildGenre", "genre") in suggested,
"missing grandchild's related tag")
- self.assertTrue('Theme' in [tag.name for tag in cats['theme']],
+ self.assertTrue(('Theme', 'theme') in suggested,
"missing related theme")
- self.assertFalse('Child1Theme' in [tag.name for tag in cats['theme']],
- "There should be no child-only related `theme` tags")
- self.assertTrue('GChildTheme' in [tag.name for tag in cats['theme']],
+ self.assertTrue(('Child1Theme', 'theme') in suggested,
+ "missing child's related theme")
+ self.assertTrue(('GChildTheme', 'theme') in suggested,
"missing grandchild's related theme")
def test_related_differ(self):
""" related tags shouldn't include filtering tags """
response = self.client.get('/katalog/rodzaj/kind/')
- cats = response.context['categories']
- self.assertFalse(cats.get('kind', False),
+ suggested = response.context['suggested_tags']
+ self.assertFalse(any(x for x in suggested if x.category == 'kind'),
'filtering tag wrongly included in related')
- cats = self.client.get('/katalog/motyw/theme/').context['categories']
- self.assertFalse('Theme' in [tag.name for tag in cats['theme']],
+ suggested = {
+ (t.name, t.category)
+ for t in self.client.get(
+ '/katalog/motyw/theme/').context['suggested_tags']
+ }
+ self.assertFalse(('Theme', 'theme') in suggested,
'filtering theme wrongly included in related')
def test_parent_tag_once(self):
""" if parent and descendants have a common tag, count it only once """
- cats = self.client.get('/katalog/rodzaj/kind/').context['categories']
- self.assertEqual([(tag.name, tag.count) for tag in cats['epoch']],
+ suggested = self.client.get('/katalog/rodzaj/kind/').context['suggested_tags']
+ self.assertEqual([(tag.name, tag.count) for tag in suggested if tag.category == 'epoch'],
[('Epoch', 1)],
                         'wrong related epoch tag on tag page')
def test_siblings_tags_count(self):
""" if children have tags and parent hasn't, count the children """
- cats = self.client.get('/katalog/epoka/epoch/').context['categories']
+ suggested = self.client.get('/katalog/epoka/epoch/').context['suggested_tags']
+ kinds = [(tag.name, tag.count) for tag in suggested if tag.category == 'kind']
self.assertTrue(
- ('ChildKind', 2) in [(tag.name, tag.count) for tag in cats['kind']],
- 'wrong related kind tags on tag page, got: ' +
- str([(tag.name, tag.count) for tag in cats['kind']]))
+ ('ChildKind', 2) in kinds,
+ 'wrong related kind tags on tag page'
+ )
        # all occurrences of theme should be counted
- self.assertTrue(('Theme', 4) in [(tag.name, tag.count) for tag in cats['theme']],
- 'wrong related theme count')
+ themes = [(tag.name, tag.count) for tag in suggested if tag.category == 'theme']
+ self.assertTrue(
+ ('Theme', 4) in themes,
+ 'wrong related theme count'
+ )
def test_query_child_tag(self):
"""
If child and parent have a common tag, but parent isn't included
in the result, child should still count.
"""
- cats = self.client.get('/katalog/gatunek/childgenre/').context['categories']
- self.assertTrue(('Epoch', 2) in [(tag.name, tag.count) for tag in cats['epoch']],
- 'wrong related kind tags on tag page, got: ' +
- str([(tag.name, tag.count) for tag in cats['epoch']]))
+ suggested = self.client.get('/katalog/gatunek/childgenre/').context['suggested_tags']
+ epochs = [(tag.name, tag.count) for tag in suggested if tag.category == 'epoch']
+ self.assertTrue(
+ ('Epoch', 2) in epochs,
+            'wrong related epoch tags on tag page'
+ )
class CleanTagRelationTests(WLTestCase):
""" there should be no related tags left after deleting some objects """
models.Book.objects.all().delete()
- cats = self.client.get('/katalog/rodzaj/k/').context['categories']
- self.assertEqual({k: v for (k, v) in cats.items() if v}, {})
+ suggested = self.client.get('/katalog/rodzaj/k/').context['suggested_tags']
+ self.assertEqual(suggested, [])
self.assertEqual(models.Fragment.objects.all().count(), 0,
"orphaned fragments left")
self.assertEqual(models.Tag.intermediary_table_model.objects.all().count(), 0,
self.book_info)
categories = {'author': 'autor', 'theme': 'motyw', 'epoch': 'epoka', 'kind': 'rodzaj', 'genre': 'gatunek'}
for cat, localcat in categories.items():
+            if cat == 'theme':
+                continue
context = self.client.get('/katalog/%s/tag/' % localcat).context
self.assertEqual(1, len(context['object_list']))
- self.assertNotEqual({}, context['categories'])
- self.assertFalse(context['categories'].get(cat, False))
+ self.assertNotEqual([], context['suggested_tags'])
+ self.assertFalse(any(t for t in context['suggested_tags'] if t.category == cat))
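
These tests track a context change: the views now expose a flat suggested_tags list instead of the old categories dict keyed by category. Where a per-category view is still wanted, it can be rebuilt in one pass; a sketch:

    from collections import defaultdict

    def group_by_category(suggested_tags):
        """Approximate the old `categories` context variable."""
        cats = defaultdict(list)
        for tag in suggested_tags:
            cats[tag.category].append(tag)
        return dict(cats)
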
class BookTagsTests(WLTestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
club = models.Club.objects.first()
- self.fields['custom_amount'].widget.attrs['min'] = club.min_amount
+ if club is not None:
+ self.fields['custom_amount'].widget.attrs['min'] = club.min_amount
def clean(self):
state = {}
def last_amount_wide(apps, schema_editor):
SingleAmount = apps.get_model('club', 'SingleAmount')
a = SingleAmount.objects.last()
- a.wide = True
- a.save()
+ if a is not None:
+ a.wide = True
+ a.save()
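
The guard makes the data migration a no-op on an empty SingleAmount table (fresh databases, test runs). Such a function is typically wired in with a no-op reverse so the migration stays reversible; a sketch using the standard migrations API (the operations list is illustrative, not this file's actual content):

    from django.db import migrations

    operations = [
        migrations.RunPython(last_amount_wide, migrations.RunPython.noop),
    ]
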
class Migration(migrations.Migration):
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from unittest import skipIf
from lxml import etree
from django.conf import settings
import catalogue
from catalogue.test_utils import WLTestCase, get_fixture
from catalogue.models import Book
from librarian import WLURI, XMLNamespace
-from search.index import Index
AtomNS = XMLNamespace("http://www.w3.org/2005/Atom")
-@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False), 'Requires search server and NO_SEARCH_INDEX=False.')
class OpdsSearchTests(WLTestCase):
"""Tests search feed in OPDS.."""
def setUp(self):
WLTestCase.setUp(self)
- index = Index()
- index.index.delete_all()
- index.index.commit()
self.do_doktora = Book.from_xml_file(
get_fixture('do-doktora.xml'))
tree = etree.fromstring(
self.client.get('/opds/search/?%s' % query).content)
elem_ids = tree.findall('.//%s/%s' % (AtomNS('entry'), AtomNS('id')))
- slugs = [WLURI(elem.text).slug for elem in elem_ids]
+ slugs = [WLURI.from_text(elem.text).slug for elem in elem_ids]
self.assertEqual(set(slugs), set(b.slug for b in books), "OPDS search '%s' failed." % query)
def test_opds_search_simple(self):
from basicauth import logged_in_or_basicauth, factory_decorator
from catalogue.models import Book, Tag
+from search.utils import UnaccentSearchQuery, UnaccentSearchVector
-from search.views import Search
import operator
import logging
import re
'text': (10, 11),
}
- PARAMS_TO_FIELDS = {
- 'author': 'authors',
- 'translator': 'translators',
- # 'title': 'title',
- 'categories': 'tag_name_pl',
- 'description': 'text',
- # 'text': 'text',
- }
-
ATOM_PLACEHOLDER = re.compile(r"^{(atom|opds):\w+}$")
def get_object(self, request):
# query is set above.
log.debug("Inline query = [%s], criteria: %s" % (query, criteria))
- srch = Search()
-
- book_hit_filter = srch.index.Q(book_id__any=True)
- filters = [book_hit_filter] + [srch.index.Q(
- **{self.PARAMS_TO_FIELDS.get(cn, cn): criteria[cn]}
- ) for cn in self.MATCHES.keys() if cn in criteria
- if criteria[cn]]
-
+ books = Book.objects.filter(findable=True).annotate(
+ search_vector=UnaccentSearchVector('title')
+ )
if query:
- q = srch.index.query(
- reduce(
- operator.or_,
- [srch.index.Q(**{self.PARAMS_TO_FIELDS.get(cn, cn): query}) for cn in self.MATCHES.keys()],
- srch.index.Q()))
- else:
- q = srch.index.query(srch.index.Q())
-
- q = srch.apply_filters(q, filters).field_limit(score=True, fields=['book_id'])
- results = q.execute()
-
- book_scores = dict([(r['book_id'], r['score']) for r in results])
- books = Book.objects.filter(findable=True, id__in=set([r['book_id'] for r in results]))
- books = list(books)
- books.sort(reverse=True, key=lambda book: book_scores[book.id])
+ squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
+ books = books.filter(search_vector=squery)
+ if criteria['author']:
+ authors = Tag.objects.filter(category='author').annotate(
+ search_vector=UnaccentSearchVector('name_pl')
+ ).filter(search_vector=UnaccentSearchQuery(criteria['author'], config=settings.SEARCH_CONFIG))
+ books = books.filter(tag_relations__tag__in=authors)
+ if criteria['categories']:
+ tags = Tag.objects.filter(category__in=('genre', 'kind', 'epoch')).annotate(
+ search_vector=UnaccentSearchVector('name_pl')
+ ).filter(search_vector=UnaccentSearchQuery(criteria['categories'], config=settings.SEARCH_CONFIG))
+ books = books.filter(tag_relations__tag__in=tags)
+ if criteria['translator']:
+ # TODO
+ pass
+ if criteria['title']:
+ books = books.filter(
+ search_vector=UnaccentSearchQuery(criteria['title'], config=settings.SEARCH_CONFIG)
+ )
+
+ books = books.exclude(ancestor__in=books)
+
+ books = books.order_by('popularity__count')
return books
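
The handler thus swaps Solr scoring for Postgres full-text filters: each criterion narrows the same Book queryset, ancestors of other hits are excluded, and ordering falls back to popularity. The core pattern, condensed into a sketch built from the same helpers:

    from django.conf import settings
    from catalogue.models import Book
    from search.utils import UnaccentSearchQuery, UnaccentSearchVector

    def find_books(query):
        squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
        return (
            Book.objects.filter(findable=True)
            .annotate(search_vector=UnaccentSearchVector('title'))
            .filter(search_vector=squery)
            .order_by('popularity__count')
        )
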
def get_link(self, query):
def pretty_title(self, html_links=False):
return ', '.join((self.author, self.title))
-
-
-if not settings.NO_SEARCH_INDEX:
- def update_index(sender, instance, **kwargs):
- from search.index import Index
- idx = Index()
- idx.index_tags(instance, remove_only='created' not in kwargs)
-
- post_delete.connect(update_index, Author)
- post_delete.connect(update_index, BookStub)
- post_save.connect(update_index, Author)
- post_save.connect(update_index, BookStub)
return None
@classmethod
- def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False, search_index=True):
+ def from_xml_file(cls, xml_file, image_file=None, image_store=None, overwrite=False):
"""
        Import xml and its accompanying image file.
If image file is missing, it will be fetched by librarian.picture.ImageStore
picture.xml_file.save("%s.xml" % picture.slug, File(xml_file))
picture.save()
tasks.generate_picture_html(picture.id)
- if not settings.NO_SEARCH_INDEX and search_index:
- tasks.index_picture.delay(picture.id, picture_info=picture_xml.picture_info)
if close_xml_file:
xml_file.close()
def clear_cache(self):
clear_cached_renders(self.mini_box)
clear_cached_renders(self.midi_box)
-
- def search_index(self, picture_info=None, index=None, index_tags=True, commit=True):
- if index is None:
- from search.index import Index
- index = Index()
- try:
- index.index_picture(self, picture_info)
- if index_tags:
- index.index_tags()
- if commit:
- index.index.commit()
- except Exception as e:
- index.index.rollback()
- raise e
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import json
-from traceback import print_exc
-
from celery import shared_task
from django.core.files.base import ContentFile
from django.template.loader import render_to_string
'themes': areas_json['themes'],
})
pic.html_file.save("%s.html" % pic.slug, ContentFile(html_text))
-
-
-@shared_task
-def index_picture(picture_id, picture_info=None, **kwargs):
- from picture.models import Picture
- try:
- return Picture.objects.get(id=picture_id).search_index(picture_info, **kwargs)
- except Exception as e:
- print("Exception during index: %s" % e)
- print_exc()
- raise e
+++ /dev/null
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import re
-from urllib.parse import urlencode
-import warnings
-from httplib2 import socket
-from lxml import etree
-from scorched import connection, exc, search
-
-
-class CustomSolrConnection(connection.SolrConnection):
- def __init__(self, *args, **kw):
- super(CustomSolrConnection, self).__init__(*args, **kw)
- self.analysis_url = self.url + "analysis/field/"
-
- def analyze(self, params):
- qs = urlencode(params)
- url = "%s?%s" % (self.analysis_url, qs)
- if len(url) > self.max_length_get_url:
- warnings.warn("Long query URL encountered - POSTing instead of GETting. "
- "This query will not be cached at the HTTP layer")
- url = self.analysis_url
- kwargs = dict(
- method="POST",
- data=qs,
- headers={"Content-Type": "application/x-www-form-urlencoded"},
- )
- else:
- kwargs = dict(method="GET")
- response = self.request(url=url, **kwargs)
- if response.status_code != 200:
- raise exc.SolrError(response)
- return response.content
-
-
-class CustomSolrInterface(connection.SolrInterface):
- # just copied from parent and SolrConnection -> CustomSolrConnection
- def __init__(self, url, http_connection=None, mode='',
- retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL,
- search_timeout=()):
- """
- :param url: url to Solr
- :type url: str
- :param http_connection: optional -- already existing connection
- :type http_connection: requests connection
- :param mode: optional -- mode (readable, writable) Solr
- :type mode: str
- :param retry_timeout: optional -- timeout until retry
- :type retry_timeout: int
- :param max_length_get_url: optional -- max length until switch to post
- :type max_length_get_url: int
- :param search_timeout: (optional) How long to wait for the server to
- send data before giving up, as a float, or a
- (connect timeout, read timeout) tuple.
- :type search_timeout: float or tuple
- """
-
- self.conn = CustomSolrConnection(
- url, http_connection, mode, retry_timeout, max_length_get_url)
- self.schema = self.init_schema()
- self._datefields = self._extract_datefields(self.schema)
-
-
- def _analyze(self, **kwargs):
- if not self.conn.readable:
- raise TypeError("This Solr instance is only for writing")
- args = {
- 'analysis_showmatch': True
- }
- if 'field' in kwargs:
- args['analysis_fieldname'] = kwargs['field']
- if 'text' in kwargs:
- args['analysis_fieldvalue'] = kwargs['text']
- if 'q' in kwargs:
- args['q'] = kwargs['q']
- if 'query' in kwargs:
- args['q'] = kwargs['q']
-
- params = [
- (k.replace('_', '.'), v)
- for (k, v) in search.params_from_dict(**args)
- ]
-
- content = self.conn.analyze(params)
- doc = etree.fromstring(content)
- return doc
-
- def highlight(self, **kwargs):
- doc = self._analyze(**kwargs)
- analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
- matches = set()
- for wrd in analyzed:
- start = int(wrd.xpath("int[@name='start']")[0].text)
- end = int(wrd.xpath("int[@name='end']")[0].text)
- matches.add((start, end))
-
- if matches:
- return self.substring(
- kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("<b>", "</b>")))
- else:
- return None
-
- def analyze(self, **kwargs):
- doc = self._analyze(**kwargs)
- terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
- terms = map(lambda n: str(n.text), terms)
- return terms
-
- def expand_margins(self, text, start, end):
- totlen = len(text)
-
- def is_boundary(x):
- ws = re.compile(r"\W", re.UNICODE)
- return bool(ws.match(x))
-
- while start > 0:
- if is_boundary(text[start - 1]):
- break
- start -= 1
-
- while end < totlen - 1:
- if is_boundary(text[end + 1]):
- break
- end += 1
-
- return start, end
-
- def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
- totlen = len(text)
- matches_margins = [
- ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches]
-
- # lets start with first match
- (start, end) = matches_margins[0][1]
- new_matches = [matches_margins[0][0]]
-
- for (m, (s, e)) in matches_margins[1:]:
- if end < s or start > e:
- continue
- start = min(start, s)
- end = max(end, e)
- new_matches.append(m)
-
- snip = text[start:end]
- new_matches.sort(key=lambda a: -a[0])
-
- for (s, e) in new_matches:
- off = -start
- snip = snip[:e + off] + mark[1] + snip[e + off:]
- snip = snip[:s + off] + mark[0] + snip[s + off:]
- snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip)
-
- return snip
def results(self):
qs = self.get_querysets()
query = self.cleaned_data['q']
- squery = UnaccentSearchQuery(query, config='polish')
- query = SearchQuery(query, config='polish')
+ squery = UnaccentSearchQuery(query, config=settings.SEARCH_CONFIG)
+ query = SearchQuery(query, config=settings.SEARCH_CONFIG)
books = qs['book'].annotate(
search_vector=UnaccentSearchVector('title')
).filter(search_vector=squery)
headline=SearchHeadline(
'text',
query,
- config='polish',
+ config=settings.SEARCH_CONFIG,
start_sel='<strong>',
stop_sel='</strong>',
)
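
Replacing the hard-coded 'polish' with settings.SEARCH_CONFIG keeps the Postgres text-search dictionary in one place. A sketch of the assumed settings entry:

    # settings.py (sketch): 'polish' mirrors the previously hard-coded value;
    # the name must match a text search configuration in the database.
    SEARCH_CONFIG = 'polish'
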
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from functools import reduce, total_ordering
-from itertools import chain
-import logging
-import operator
-import os
import re
-from django.conf import settings
-from librarian import dcparser
-import librarian.meta.types.person
-import librarian.meta.types.text
from librarian.parser import WLDocument
from lxml import etree
-import scorched
-import catalogue.models
-import picture.models
-from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
-from wolnelektury.utils import makedirs
-from . import custom
-log = logging.getLogger('search')
-
-if os.path.isfile(settings.SOLR_STOPWORDS):
- stopwords = set(
- line.strip()
- for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
-else:
- stopwords = set()
-
-
-class SolrIndex(object):
- def __init__(self, mode=None):
- self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
-
-
-class Snippets(object):
- """
- This class manages snippet files for indexed object (book)
- the snippets are concatenated together, and their positions and
- lengths are kept in lucene index fields.
- """
- SNIPPET_DIR = "snippets"
-
- def __init__(self, book_id, revision=None):
- makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
- self.book_id = book_id
- self.revision = revision
- self.file = None
- self.position = None
-
- @property
- def path(self):
- if self.revision:
- fn = "%d.%d" % (self.book_id, self.revision)
- else:
- fn = "%d" % self.book_id
-
- return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
-
- def open(self, mode='r'):
- """
- Open the snippet file. Call .close() afterwards.
- """
- if 'b' not in mode:
- mode += 'b'
-
- if 'w' in mode:
- if os.path.exists(self.path):
- self.revision = 1
- while True:
- if not os.path.exists(self.path):
- break
- self.revision += 1
-
- self.file = open(self.path, mode)
- self.position = 0
- return self
-
- def add(self, snippet):
- """
- Append a snippet (unicode) to the snippet file.
- Return a (position, length) tuple
- """
- txt = snippet.encode('utf-8')
- l = len(txt)
- self.file.write(txt)
- pos = (self.position, l)
- self.position += l
- return pos
-
- def get(self, pos):
- """
- Given a tuple of (position, length) return an unicode
- of the snippet stored there.
- """
- self.file.seek(pos[0], 0)
- try:
- txt = self.file.read(pos[1]).decode('utf-8')
- except:
- return ''
- return txt
-
- def close(self):
- """Close snippet file"""
- if self.file:
- self.file.close()
-
- def remove(self):
- self.revision = None
- try:
- os.unlink(self.path)
- self.revision = 0
- while True:
- self.revision += 1
- os.unlink(self.path)
- except OSError:
- pass
-
-
-class Index(SolrIndex):
+class Index:
"""
Class indexing books.
"""
- def __init__(self):
- super(Index, self).__init__(mode='rw')
-
- def remove_snippets(self, book):
- book.snippet_set.all().delete()
-
- def add_snippet(self, book, doc):
- assert book.id == doc.pop('book_id')
- # Fragments already exist and can be indexed where they live.
- if 'fragment_anchor' in doc:
- return
-
- text = doc.pop('text')
- header_index = doc.pop('header_index')
- book.snippet_set.create(
- sec=header_index,
- text=text,
- )
-
- def delete_query(self, *queries):
- """
- index.delete(queries=...) doesn't work, so let's reimplement it
- using deletion of list of uids.
- """
- uids = set()
- for q in queries:
- if isinstance(q, scorched.search.LuceneQuery):
- q = self.index.query(q)
- q.field_limiter.update(['uid'])
- st = 0
- rows = 100
- while True:
- ids = q.paginate(start=st, rows=rows).execute()
- if not len(ids):
- break
- for res in ids:
- uids.add(res['uid'])
- st += rows
- if uids:
- # FIXME: With Solr API change, this doesn't work.
- #self.index.delete(uids)
- return True
- else:
- return False
-
- def index_tags(self, *tags, **kw):
- """
- Re-index global tag list.
- Removes all tags from index, then index them again.
- Indexed fields include: id, name (with and without polish stems), category
- """
- log.debug("Indexing tags")
- remove_only = kw.get('remove_only', False)
- # first, remove tags from index.
- if tags:
- tag_qs = []
- for tag in tags:
- q_id = self.index.Q(tag_id=tag.id)
-
- if isinstance(tag, PDCounterAuthor):
- q_cat = self.index.Q(tag_category='pd_author')
- elif isinstance(tag, PDCounterBook):
- q_cat = self.index.Q(tag_category='pd_book')
- else:
- q_cat = self.index.Q(tag_category=tag.category)
-
- q_id_cat = self.index.Q(q_id & q_cat)
- tag_qs.append(q_id_cat)
- self.delete_query(*tag_qs)
- else: # all
- q = self.index.Q(tag_id__any=True)
- self.delete_query(q)
-
- if not remove_only:
- # then add them [all or just one passed]
- if not tags:
- tags = chain(
- catalogue.models.Tag.objects.exclude(category='set'),
- PDCounterAuthor.objects.all(),
- PDCounterBook.objects.all())
-
- for tag in tags:
- if isinstance(tag, PDCounterAuthor):
- doc = {
- "tag_id": int(tag.id),
- "tag_name": tag.name,
- "tag_name_pl": tag.name,
- "tag_category": 'pd_author',
- "is_pdcounter": True,
- "uid": "tag%d_pd_a" % tag.id
- }
- elif isinstance(tag, PDCounterBook):
- doc = {
- "tag_id": int(tag.id),
- "tag_name": tag.title,
- "tag_name_pl": tag.title,
- "tag_category": 'pd_book',
- "is_pdcounter": True,
- "uid": "tag%d_pd_b" % tag.id
- }
- else:
- doc = {
- "tag_id": int(tag.id),
- "tag_name": tag.name,
- "tag_name_pl": tag.name,
- "tag_category": tag.category,
- "is_pdcounter": False,
- "uid": "tag%d" % tag.id
- }
- self.index.add(doc)
-
- def create_book_doc(self, book):
- """
- Create a lucene document referring book id.
- """
- doc = {'book_id': int(book.id)}
- if book.parent is not None:
- doc['parent_id'] = int(book.parent.id)
- return doc
-
- def remove_book(self, book, remove_snippets=True, legacy=True):
- """Removes a book from search index.
- book - Book instance."""
- if legacy:
- self.delete_query(self.index.Q(book_id=book.id))
-
- if remove_snippets:
- snippets = Snippets(book.id)
- snippets.remove()
- self.remove_snippets(book)
-
- def index_book(self, book, book_info=None, overwrite=True, legacy=True):
- """
- Indexes the book.
- Creates a lucene document for extracted metadata
- and calls self.index_content() to index the contents of the book.
- """
- if not book.xml_file: return
-
- if overwrite:
- # we don't remove snippets, since they might be still needed by
- # threads using not reopened index
- self.remove_book(book, remove_snippets=False, legacy=legacy)
-
- book_doc = self.create_book_doc(book)
- meta_fields = self.extract_metadata(book, book_info, dc_only=[
- 'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
- # let's not index it - it's only used for extracting publish date
- if 'source_name' in meta_fields:
- del meta_fields['source_name']
-
- for n, f in meta_fields.items():
- book_doc[n] = f
-
- book_doc['uid'] = "book%s" % book_doc['book_id']
- if legacy:
- self.index.add(book_doc)
- del book_doc
- book_fields = {
- 'title': meta_fields['title'],
- 'authors': meta_fields['authors'],
- 'published_date': meta_fields['published_date']
- }
-
- for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
- if tag_name in meta_fields:
- book_fields[tag_name] = meta_fields[tag_name]
-
- self.index_content(book, book_fields=book_fields, legacy=legacy)
-
master_tags = [
'opowiadanie',
'powiesc',
'uwaga', 'extra', 'nota_red', 'abstrakt',
'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
'didaskalia',
- 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
+    'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw',
]
footnote_tags = ['pa', 'pt', 'pr', 'pe']
skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
'{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
- published_date_re = re.compile("([0-9]+)[\]. ]*$")
-
- def extract_metadata(self, book, book_info=None, dc_only=None):
- """
- Extract metadata from book and returns a map of fields keyed by fieldname
- """
- fields = {}
-
- if book_info is None:
- book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
-
- fields['slug'] = book.slug
- fields['is_book'] = True
-
- # validator, name
- for field in dcparser.BookInfo.FIELDS:
- if dc_only and field.name not in dc_only:
- continue
- if hasattr(book_info, field.name):
- if not getattr(book_info, field.name):
- continue
- type_indicator = field.value_type
- if issubclass(type_indicator, librarian.meta.types.text.TextValue):
- s = getattr(book_info, field.name)
- if field.multiple:
- s = ', '.join(s)
- fields[field.name] = s
- elif issubclass(type_indicator, librarian.meta.types.person.Person):
- p = getattr(book_info, field.name)
- if isinstance(p, librarian.meta.types.person.Person):
- persons = str(p)
- else:
- persons = ', '.join(map(str, p))
- fields[field.name] = persons
-
- # get published date
- pd = None
- if hasattr(book_info, 'source_name') and book_info.source_name:
- match = self.published_date_re.search(book_info.source_name)
- if match is not None:
- pd = str(match.groups()[0])
- if not pd:
- pd = ""
- fields["published_date"] = pd
-
- return fields
-
- # def add_gaps(self, fields, fieldname):
- # """
- # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
- # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
- # """
- # def gap():
- # while True:
- # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
- # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
-
- def get_master(self, root):
+ @classmethod
+ def get_master(cls, root):
"""
Returns the first master tag from an etree.
"""
for master in root.iter():
- if master.tag in self.master_tags:
+ if master.tag in cls.master_tags:
return master
- def index_content(self, book, book_fields, legacy=True):
+ @staticmethod
+ def add_snippet(book, text, position):
+ book.snippet_set.create(
+ sec=position + 1,
+ text=text
+ )
+
+ @classmethod
+ def index_book(cls, book):
"""
        Walks the book XML and extracts content from it.
Adds parts for each header tag and for each fragment.
"""
+        if not book.xml_file:
+            return
+
+ book.snippet_set.all().delete()
+
wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
root = wld.edoc.getroot()
- master = self.get_master(root)
+ master = cls.get_master(root)
if master is None:
return []
def walker(node):
- if node.tag not in self.ignore_content_tags:
+ if node.tag not in cls.ignore_content_tags:
yield node, None, None
if node.text is not None:
yield None, node.text, None
return
def fix_format(text):
- # separator = [" ", "\t", ".", ";", ","]
if isinstance(text, list):
- # need to join it first
                text = filter(lambda s: s is not None, text)
text = ' '.join(text)
- # for i in range(len(text)):
- # if i > 0:
- # if text[i][0] not in separator\
- # and text[i - 1][-1] not in separator:
- # text.insert(i, " ")
return re.sub("(?m)/$", "", text)
- def add_part(snippets, **fields):
- doc = self.create_book_doc(book)
- for n, v in book_fields.items():
- doc[n] = v
-
- doc['header_index'] = fields["header_index"]
- doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
- doc['header_type'] = fields['header_type']
-
- doc['text'] = fields['text']
-
- # snippets
- snip_pos = snippets.add(fields["text"])
-
- doc['snippets_position'] = snip_pos[0]
- doc['snippets_length'] = snip_pos[1]
- if snippets.revision:
- doc["snippets_revision"] = snippets.revision
-
- if 'fragment_anchor' in fields:
- doc["fragment_anchor"] = fields['fragment_anchor']
-
- if 'themes' in fields:
- doc['themes'] = fields['themes']
- doc['uid'] = "part%s-%s-%s-%s" % (
- book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
- return doc
-
- fragments = {}
- snippets = Snippets(book.id).open('w')
- try:
- for header, position in zip(list(master), range(len(master))):
-
- if header.tag in self.skip_header_tags:
- continue
- if header.tag is etree.Comment:
- continue
-
- # section content
- content = []
- footnote = []
-
- def all_content(text):
- for frag in fragments.values():
- frag['text'].append(text)
- content.append(text)
- handle_text = [all_content]
-
- for start, text, end in walker(header):
- # handle footnotes
- if start is not None and start.tag in self.footnote_tags:
- footnote = []
-
- def collect_footnote(t):
- footnote.append(t)
-
- handle_text.append(collect_footnote)
- elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
- handle_text.pop()
- doc = add_part(snippets, header_index=position, header_type=header.tag,
- text=''.join(footnote))
- self.add_snippet(book, doc)
- if legacy:
- self.index.add(doc)
- footnote = []
-
- # handle fragments and themes.
- if start is not None and start.tag == 'begin':
- fid = start.attrib['id'][1:]
- fragments[fid] = {
- 'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
-
- # themes for this fragment
- elif start is not None and start.tag == 'motyw':
- fid = start.attrib['id'][1:]
- handle_text.append(lambda text: None)
- if start.text is not None:
- fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
- elif end is not None and end.tag == 'motyw':
- handle_text.pop()
-
- elif start is not None and start.tag == 'end':
- fid = start.attrib['id'][1:]
- if fid not in fragments:
- continue # a broken <end> node, skip it
- frag = fragments[fid]
- if not frag['themes']:
- continue # empty themes list.
- del fragments[fid]
-
- doc = add_part(snippets,
- header_type=frag['start_header'],
- header_index=frag['start_section'],
- header_span=position - frag['start_section'] + 1,
- fragment_anchor=fid,
- text=fix_format(frag['text']),
- themes=frag['themes'])
- # Add searchable fragment
- self.add_snippet(book, doc)
- if legacy:
- self.index.add(doc)
-
- # Collect content.
-
- if text is not None and handle_text is not []:
- hdl = handle_text[-1]
- hdl(text)
-
- # in the end, add a section text.
- doc = add_part(snippets, header_index=position,
- header_type=header.tag, text=fix_format(content))
-
- self.add_snippet(book, doc)
- if legacy:
- self.index.add(doc)
-
- finally:
- snippets.close()
-
- def remove_picture(self, picture_or_id):
- """Removes a picture from search index."""
- if isinstance(picture_or_id, picture.models.Picture):
- picture_id = picture_or_id.id
- else:
- picture_id = picture_or_id
- self.delete_query(self.index.Q(picture_id=picture_id))
-
- def index_picture(self, picture, picture_info=None, overwrite=True):
- """
- Indexes the picture.
- Creates a lucene document for extracted metadata
- and calls self.index_area() to index the contents of the picture.
- """
- if overwrite:
- # we don't remove snippets, since they might be still needed by
- # threads using not reopened index
- self.remove_picture(picture)
-
- picture_doc = {'picture_id': int(picture.id)}
- meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
- 'authors', 'title', 'epochs', 'kinds', 'genres'])
-
- picture_doc.update(meta_fields)
-
- picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
- self.index.add(picture_doc)
- del picture_doc['is_book']
- for area in picture.areas.all():
- self.index_area(area, picture_fields=picture_doc)
-
- def index_area(self, area, picture_fields):
- """
- Indexes themes and objects on the area.
- """
- doc = dict(picture_fields)
- doc['area_id'] = area.id
- doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
- doc['uid'] = 'area%s' % area.id
- self.index.add(doc)
-
-
-@total_ordering
-class SearchResult(object):
- def __init__(self, doc, how_found=None, query_terms=None):
- self.boost = 1.0
- self._hits = []
- self._processed_hits = None # processed hits
- self.snippets = []
- self.query_terms = query_terms
- self._book = None
-
- if 'score' in doc:
- self._score = doc['score']
- else:
- self._score = 0
-
- self.book_id = int(doc["book_id"])
-
- try:
- self.published_date = int(doc.get("published_date"))
- except ValueError:
- self.published_date = 0
-
- # content hits
- header_type = doc.get("header_type", None)
- # we have a content hit in some header of fragment
- if header_type is not None:
- sec = (header_type, int(doc["header_index"]))
- header_span = doc['header_span']
- header_span = header_span is not None and int(header_span) or 1
- fragment = doc.get("fragment_anchor", None)
- snippets_pos = (doc['snippets_position'], doc['snippets_length'])
- snippets_rev = doc.get('snippets_revision', None)
-
- hit = (sec + (header_span,), fragment, self._score, {
- 'how_found': how_found,
- 'snippets_pos': snippets_pos,
- 'snippets_revision': snippets_rev,
- 'themes': doc.get('themes', []),
- 'themes_pl': doc.get('themes_pl', [])
- })
-
- self._hits.append(hit)
-
- @classmethod
- def from_book(cls, book, how_found=None, query_terms=None):
- doc = {
- 'score': book.popularity.count,
- 'book_id': book.id,
- 'published_date': 0,
- }
- result = cls(doc, how_found=how_found, query_terms=query_terms)
- result._book = book
- return result
-
- def __str__(self):
- return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
- (self.book_id, len(self._hits),
- len(self._processed_hits) if self._processed_hits else -1,
- self._score, len(self.snippets))
-
- def __bytes__(self):
- return str(self).encode('utf-8')
-
- @property
- def score(self):
- return self._score * self.boost
-
- def merge(self, other):
- if self.book_id != other.book_id:
- raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
- self._hits += other._hits
- self._score += max(other._score, 0)
- return self
-
- def get_book(self):
- if self._book is not None:
- return self._book
- try:
- self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
- except catalogue.models.Book.DoesNotExist:
- self._book = None
- return self._book
-
- book = property(get_book)
-
- POSITION = 0
- FRAGMENT = 1
- POSITION_INDEX = 1
- POSITION_SPAN = 2
- SCORE = 2
- OTHER = 3
-
- @property
- def hits(self):
- if self._processed_hits is not None:
- return self._processed_hits
-
- # to sections and fragments
- frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
-
- sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
-
- # sections not covered by fragments
- sect = filter(lambda s: 0 == len(list(filter(
- lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
- f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
-
- def remove_duplicates(lst, keyfn, larger):
- els = {}
- for e in lst:
- eif = keyfn(e)
- if eif in els:
- if larger(els[eif], e):
- continue
- els[eif] = e
- return els.values()
-
- # remove fragments with duplicated fid's and duplicated snippets
- frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
-
- # remove duplicate sections
- sections = {}
-
- for s in sect:
- si = s[self.POSITION][self.POSITION_INDEX]
- # skip existing
- if si in sections:
- if sections[si]['score'] >= s[self.SCORE]:
- continue
-
- m = {'score': s[self.SCORE],
- 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
- }
- m.update(s[self.OTHER])
- sections[si] = m
-
- hits = list(sections.values())
-
- for f in frags:
- try:
- frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
- except catalogue.models.Fragment.DoesNotExist:
- # stale index
+ for position, header in enumerate(master):
+ if header.tag in cls.skip_header_tags:
continue
- # Figure out if we were searching for a token matching some word in theme name.
- themes = frag.tags.filter(category='theme')
- themes_hit = set()
- if self.query_terms is not None:
- for i in range(0, len(f[self.OTHER]['themes'])):
- tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
- tms = map(str.lower, tms)
- for qt in self.query_terms:
- if qt in tms:
- themes_hit.add(f[self.OTHER]['themes'][i])
- break
-
- def theme_by_name(n):
- th = list(filter(lambda t: t.name == n, themes))
- if th:
- return th[0]
- else:
- return None
- themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
-
- m = {'score': f[self.SCORE],
- 'fragment': frag,
- 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
- 'themes': themes,
- 'themes_hit': themes_hit
- }
- m.update(f[self.OTHER])
- hits.append(m)
-
- hits.sort(key=lambda h: h['score'], reverse=True)
-
- self._processed_hits = hits
-
- return hits
-
- @staticmethod
- def aggregate(*result_lists):
- books = {}
- for rl in result_lists:
- for r in rl:
- if r.book_id in books:
- books[r.book_id].merge(r)
- else:
- books[r.book_id] = r
- return books.values()
-
- def get_sort_key(self):
- return (-self.score,
- self.published_date,
- self.book.sort_key_author if self.book else '',
- self.book.sort_key if self.book else '')
-
- def __lt__(self, other):
- return self.get_sort_key() > other.get_sort_key()
-
- def __eq__(self, other):
- return self.get_sort_key() == other.get_sort_key()
-
- def __len__(self):
- return len(self.hits)
-
- def snippet_pos(self, idx=0):
- return self.hits[idx]['snippets_pos']
-
- def snippet_revision(self, idx=0):
- try:
- return self.hits[idx]['snippets_revision']
- except (IndexError, KeyError):
- return None
-
-
-@total_ordering
-class PictureResult(object):
- def __init__(self, doc, how_found=None, query_terms=None):
- self.boost = 1.0
- self.query_terms = query_terms
- self._picture = None
- self._hits = []
- self._processed_hits = None
-
- if 'score' in doc:
- self._score = doc['score']
- else:
- self._score = 0
-
- self.picture_id = int(doc["picture_id"])
-
- if doc.get('area_id'):
- hit = (self._score, {
- 'how_found': how_found,
- 'area_id': doc['area_id'],
- 'themes': doc.get('themes', []),
- 'themes_pl': doc.get('themes_pl', []),
- })
-
- self._hits.append(hit)
-
- def __str__(self):
- return "<PR id=%d score=%f >" % (self.picture_id, self._score)
-
- def __repr__(self):
- return str(self)
-
- @property
- def score(self):
- return self._score * self.boost
-
- def merge(self, other):
- if self.picture_id != other.picture_id:
- raise ValueError(
- "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
- self._hits += other._hits
- self._score += max(other._score, 0)
- return self
-
- SCORE = 0
- OTHER = 1
-
- @property
- def hits(self):
- if self._processed_hits is not None:
- return self._processed_hits
-
- hits = []
- for hit in self._hits:
- try:
- area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
- except picture.models.PictureArea.DoesNotExist:
- # stale index
+ if header.tag is etree.Comment:
continue
- # Figure out if we were searching for a token matching some word in theme name.
- themes_hit = set()
- if self.query_terms is not None:
- for i in range(0, len(hit[self.OTHER]['themes'])):
- tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
- tms = map(str.lower, tms)
- for qt in self.query_terms:
- if qt in tms:
- themes_hit.add(hit[self.OTHER]['themes'][i])
- break
- m = {
- 'score': hit[self.SCORE],
- 'area': area,
- 'themes_hit': themes_hit,
- }
- m.update(hit[self.OTHER])
- hits.append(m)
+ # section content
+ content = []
+ footnote = []
- hits.sort(key=lambda h: h['score'], reverse=True)
- hits = hits[:1]
- self._processed_hits = hits
- return hits
+ def all_content(text):
+ content.append(text)
+ handle_text = [all_content]
- def get_picture(self):
- if self._picture is None:
- self._picture = picture.models.Picture.objects.get(id=self.picture_id)
- return self._picture
-
- picture = property(get_picture)
-
- @staticmethod
- def aggregate(*result_lists):
- books = {}
- for rl in result_lists:
- for r in rl:
- if r.picture_id in books:
- books[r.picture_id].merge(r)
- else:
- books[r.picture_id] = r
- return books.values()
+ for start, text, end in walker(header):
+ # handle footnotes
+ if start is not None and start.tag in cls.footnote_tags:
+ footnote = []
- def __lt__(self, other):
- return self.score < other.score
+ def collect_footnote(t):
+ footnote.append(t)
- def __eq__(self, other):
- return self.score == other.score
-
-
-class Search(SolrIndex):
- """
- Search facilities.
- """
- def __init__(self, default_field="text"):
- super(Search, self).__init__(mode='r')
-
- def make_term_query(self, query, field='text', modal=operator.or_):
- """
- Returns term queries joined by boolean query.
- modal - applies to boolean query
- fuzzy - should the query by fuzzy.
- """
- if query is None:
- query = ''
- q = self.index.Q()
- q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
-
- return q
-
- def search_by_author(self, words):
- from catalogue.models import Book
- books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
- for word in words:
- books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
- return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
-
- def search_words(self, words, fields, required=None, book=True, picture=False):
- if book and not picture and fields == ['authors']:
- return self.search_by_author(words)
- filters = []
- for word in words:
- if book or picture or (word not in stopwords):
- word_filter = None
- for field in fields:
- q = self.index.Q(**{field: word})
- if word_filter is None:
- word_filter = q
- else:
- word_filter |= q
- filters.append(word_filter)
- if required:
- required_filter = None
- for field in required:
- for word in words:
- if book or picture or (word not in stopwords):
- q = self.index.Q(**{field: word})
- if required_filter is None:
- required_filter = q
- else:
- required_filter |= q
- filters.append(required_filter)
- if not filters:
- return []
- params = {}
- if book:
- params['is_book'] = True
- if picture:
- params['picture_id__gt'] = 0
- else:
- params['book_id__gt'] = 0
- query = self.index.query(**params)
- query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
- result_class = PictureResult if picture else SearchResult
- return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
-
- def get_snippets(self, searchresult, query, field='text', num=1):
- """
- Returns a snippet for found scoreDoc.
- """
- maxnum = len(searchresult)
- if num is None or num < 0 or num > maxnum:
- num = maxnum
- book_id = searchresult.book_id
- revision = searchresult.snippet_revision()
- snippets = Snippets(book_id, revision=revision)
- snips = [None] * maxnum
- try:
- snippets.open()
- idx = 0
- while idx < maxnum and num > 0:
- position, length = searchresult.snippet_pos(idx)
- if position is None or length is None:
- continue
- text = snippets.get((int(position),
- int(length)))
- snip = self.index.highlight(text=text, field=field, q=query)
- if not snip and field == 'text':
- snip = self.index.highlight(text=text, field='text_nonstem', q=query)
- if snip not in snips:
- snips[idx] = snip
- if snip:
- num -= 1
- idx += 1
-
- except IOError as e:
- book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
- if not book:
- log.error("Book does not exist for book id = %d" % book_id)
- elif not book.get().children.exists():
- log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
- return []
- finally:
- snippets.close()
-
- # remove verse end markers..
- snips = [s.replace("/\n", "\n") if s else s for s in snips]
-
- searchresult.snippets = snips
-
- return snips
-
- @staticmethod
- def apply_filters(query, filters):
- """
- Apply filters to a query
- """
- if filters is None:
- filters = []
- filters = filter(lambda x: x is not None, filters)
- for f in filters:
- query = query.query(f)
- return query
+ handle_text.append(collect_footnote)
+            elif end is not None and end.tag in cls.footnote_tags:
+ handle_text.pop()
+ cls.add_snippet(book, ''.join(footnote), position)
+ footnote = []
+ if text is not None and handle_text is not []:
+ hdl = handle_text[-1]
+ hdl(text)
-if getattr(settings, 'SEARCH_MOCK', False):
- from .mock_search import Search
+ # At the end, add the section text itself.
+ cls.add_snippet(book, fix_format(content), position)
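The added block above keeps a stack of text handlers while walking the parsed document, so footnote text is collected separately from the section text. A minimal, runnable sketch of that pattern (the flat event stream and the tag set here are assumptions, not the indexer's real API):

    # Sketch only: ('start'|'end'|'text', value) tuples stand in for the tree walk.
    FOOTNOTE_TAGS = {'pa', 'pe', 'pr', 'pt'}

    def split_footnotes(events):
        body, footnotes, footnote = [], [], []
        handle_text = [body.append]        # top of the stack receives text
        for kind, value in events:
            if kind == 'start' and value in FOOTNOTE_TAGS:
                handle_text.append(footnote.append)
            elif kind == 'end' and footnote and value in FOOTNOTE_TAGS:
                handle_text.pop()
                footnotes.append(''.join(footnote))
                footnote = []
            elif kind == 'text' and handle_text:
                handle_text[-1](value)
        return ''.join(body), footnotes

    # split_footnotes([('text', 'Ala '), ('start', 'pe'), ('text', 'a note'),
    #                  ('end', 'pe'), ('text', 'ma kota')])
    # -> ('Ala ma kota', ['a note'])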
+++ /dev/null
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import sys
-import traceback
-
-from django.core.management.base import BaseCommand
-
-
-def query_yes_no(question, default="yes"):
- """Ask a yes/no question via raw_input() and return their answer.
-
- "question" is a string that is presented to the user.
- "default" is the presumed answer if the user just hits <Enter>.
- It must be "yes" (the default), "no" or None (meaning
- an answer is required of the user).
-
- The "answer" return value is one of "yes" or "no".
- """
- valid = {"yes": True, "y": True, "ye": True,
- "no": False, "n": False}
- if default is None:
- prompt = " [y/n] "
- elif default == "yes":
- prompt = " [Y/n] "
- elif default == "no":
- prompt = " [y/N] "
- else:
- raise ValueError("invalid default answer: '%s'" % default)
-
- while True:
- sys.stdout.write(question + prompt)
- choice = raw_input().lower()
- if default is not None and choice == '':
- return valid[default]
- elif choice in valid:
- return valid[choice]
- else:
- sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-
-
-class Command(BaseCommand):
- help = 'Reindex everything.'
-
- def add_arguments(self, parser):
- parser.add_argument(
- '-n', '--book-id', action='store_true', dest='book_id',
- default=False, help='book id instead of slugs')
- parser.add_argument(
- '-t', '--just-tags', action='store_true', dest='just_tags',
- default=False, help='just reindex tags')
- parser.add_argument(
- '--start', dest='start_from', default=None,
- help='start from this slug')
- parser.add_argument(
- '--stop', dest='stop_after', default=None,
- help='stop after this slug')
- parser.add_argument('args', nargs='*', metavar='slug/id')
-
- def handle(self, **opts):
- from catalogue.models import Book
- from search.index import Index
- idx = Index()
-
- if not opts['just_tags']:
- if opts['args']:
- books = []
- for a in opts['args']:
- if opts['book_id']:
- books += Book.objects.filter(id=int(a)).all()
- else:
- books += Book.objects.filter(slug=a).all()
- else:
- books = list(Book.objects.order_by('slug'))
- start_from = opts.get('start_from')
- stop_after = opts.get('stop_after')
- if start_from:
- start_from = start_from.replace('-', '')
- if stop_after:
- stop_after = stop_after.replace('-', '')
- while books:
- try:
- b = books[0]
- slug = b.slug.replace('-', '')
- if stop_after and slug > stop_after:
- break
- if not start_from or slug >= start_from:
- print(b.slug)
- idx.index_book(b)
- idx.index.commit()
- books.pop(0)
- except:
- traceback.print_exc()
- try:
- # we might not be able to rollback
- idx.index.rollback()
- except:
- pass
- retry = query_yes_no("Retry?")
- if not retry:
- break
-
- print('Reindexing tags.')
- idx.index_tags()
- idx.index.commit()
+++ /dev/null
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-import sys
-import traceback
-
-from django.core.management.base import BaseCommand
-
-
-def query_yes_no(question, default="yes"):
- """Ask a yes/no question via raw_input() and return their answer.
-
- "question" is a string that is presented to the user.
- "default" is the presumed answer if the user just hits <Enter>.
- It must be "yes" (the default), "no" or None (meaning
- an answer is required of the user).
-
- The "answer" return value is one of "yes" or "no".
- """
- valid = {"yes": True, "y": True, "ye": True,
- "no": False, "n": False}
- if default is None:
- prompt = " [y/n] "
- elif default == "yes":
- prompt = " [Y/n] "
- elif default == "no":
- prompt = " [y/N] "
- else:
- raise ValueError("invalid default answer: '%s'" % default)
-
- while True:
- sys.stdout.write(question + prompt)
- choice = raw_input().lower()
- if default is not None and choice == '':
- return valid[default]
- elif choice in valid:
- return valid[choice]
- else:
- sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-
-
-class Command(BaseCommand):
- help = 'Reindex pictures.'
-
- def add_arguments(self, parser):
- self.add_argument(
- '-n', '--picture-id', action='store_true', dest='picture_id',
- default=False, help='picture id instead of slugs')
- self.add_argument('slug/id', nargs='*', metavar='slug/id')
-
- def handle(self, **opts):
- from picture.models import Picture
- from search.index import Index
- idx = Index()
-
- if opts['args']:
- pictures = []
- for a in opts['args']:
- if opts['picture_id']:
- pictures += Picture.objects.filter(id=int(a)).all()
- else:
- pictures += Picture.objects.filter(slug=a).all()
- else:
- pictures = list(Picture.objects.order_by('slug'))
- while pictures:
- try:
- p = pictures[0]
- print(p.slug)
- idx.index_picture(p)
- idx.index.commit()
- pictures.pop(0)
- except:
- traceback.print_exc()
- try:
- # we might not be able to rollback
- idx.index.rollback()
- except:
- pass
- retry = query_yes_no("Retry?")
- if not retry:
- break
+++ /dev/null
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from glob import glob
-from os import path
-from django.conf import settings
-from django.core.management.base import BaseCommand
-
-
-class Command(BaseCommand):
- help = 'Check snippets.'
-
- def handle(self, *args, **opts):
- sfn = glob(settings.SEARCH_INDEX+'snippets/*')
- for fn in sfn:
- print(fn)
- bkid = path.basename(fn)
- with open(fn) as f:
- cont = f.read()
- try:
- cont.decode('utf-8')
- except UnicodeDecodeError:
- print("error in snippets %s" % bkid)
+++ /dev/null
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from unittest.mock import Mock
-from catalogue.models import Book, Tag
-from random import randint, choice
-
-
-class Search(Mock):
- """
- Search mock for development without setting up Solr.
-
- Instead of connecting to an actual search server, it returns
- some random results for any query.
- """
- class MockIndex(Mock):
- def analyze(*args, **kwargs):
- return []
-
- index = MockIndex()
-
- def search_words(self, words, fields, required=None, book=True, picture=False):
- from .index import SearchResult
-
- max_results = 20
-
- if picture: return []
-
- qs = Book.objects.filter(findable=True).order_by('?')
- results = []
- for book in qs[:randint(1, max_results)]:
- doc = {
- 'score': randint(0, 100),
- 'book_id': book.pk,
- 'published_date': randint(1000, 1920),
- }
- res = SearchResult(doc, how_found='mock', query_terms=words)
- results.append(res)
- return results
-
# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
-from unittest import skipIf
from django.conf import settings
from django.test.utils import override_settings
from catalogue.test_utils import WLTestCase, get_fixture
-import tempfile
from catalogue.models import Book
-from search.index import Index, Search
import catalogue
import opds
-@override_settings(SEARCH_INDEX=tempfile.mkdtemp(prefix='djangotest_search_'))
-@skipIf(getattr(settings, 'NO_SEARCH_INDEX', False),
- 'Requires search server and NO_SEARCH_INDEX=False.')
class BookSearchTests(WLTestCase):
def setUp(self):
WLTestCase.setUp(self)
- index = Index()
- self.search = Search()
- index.delete_query(self.search.index.query(uid="*"))
- index.index.commit()
+ with override_settings(NO_SEARCH_INDEX=False):
+ self.do_doktora = Book.from_xml_file(
+ get_fixture('do-doktora.xml', opds))
+ self.do_anusie = Book.from_xml_file(
+ get_fixture('fraszka-do-anusie.xml', catalogue))
- self.do_doktora = Book.from_xml_file(
- get_fixture('do-doktora.xml', opds))
- self.do_anusie = Book.from_xml_file(
- get_fixture('fraszka-do-anusie.xml', catalogue))
-
- # TODO: Add slop option to sunburnt
- # def test_search_perfect_parts(self):
- # books = self.search.search_phrase("Jakoż hamować")
- # assert len(books) == 2
- # for b in books:
- # b.book_id == self.book.id
- # a = SearchResult.aggregate(books)
- # # just one fragment hit.
- # assert len(a[0].hits) == 1
+ def test_search_perfect_parts(self):
+ response = self.client.get('/szukaj/', {'q': 'Jakoż hamować'})
+ res = response.context['results']
+ self.assertEqual(len(res['snippet']), 1)
+ for b, s in res['snippet'].items():
+ self.assertEqual(b.id, self.do_anusie.id)
urlpatterns = [
- path('', views.main, name='wlsearch'),
+ path('', views.search, name='wlsearch'),
path('hint/', views.hint, name='search_hint'),
]
+from django.conf import settings
from django.db.models import Func
from django.contrib.postgres.search import SearchQuery, SearchVectorField
'''
def as_sql(self, *args, **kwargs):
sql, params = super().as_sql(*args, **kwargs)
- sql = f'unaccent({sql}::text)::tsquery'
+ if settings.SEARCH_USE_UNACCENT:
+ sql = f'unaccent({sql}::text)::tsquery'
return sql, params
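For orientation: with SEARCH_USE_UNACCENT enabled, the wrapper makes accented and plain spellings produce the same query terms. A hedged usage sketch (the class name UnaccentSearchQuery is assumed from context; this hunk does not show the class header):

    # Sketch; UnaccentSearchQuery is the assumed owner of the as_sql() above.
    q = UnaccentSearchQuery('róże', config=settings.SEARCH_CONFIG)
    # Illustrative SQL it would emit:
    #   unaccent(plainto_tsquery('english', 'róże')::text)::tsquery
    # i.e. the same terms as a query for 'roze'.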
But user enters 'roze' -> stem leaves it as is, so we need original form in the vector.
'''
function='to_tsvector'
- template = '''unaccent(
- %(function)s('polish', %(expressions)s)::text)::tsvector ||
- to_tsvector(
- 'polish_simple',
- unaccent(%(expressions)s)
- )'''
+ if settings.SEARCH_USE_UNACCENT:
+ template = f'''unaccent(
+ %(function)s('{settings.SEARCH_CONFIG}', %(expressions)s)::text)::tsvector ||
+ to_tsvector(
+ '{settings.SEARCH_CONFIG_SIMPLE}',
+ unaccent(%(expressions)s)
+ )'''
+ else:
+ # Without unaccent, still concatenate the stemmed vector with a
+ # simple-config one, so unstemmed surface forms remain searchable.
+ template = f'''%(function)s('{settings.SEARCH_CONFIG}', %(expressions)s) ||
+ to_tsvector('{settings.SEARCH_CONFIG_SIMPLE}', %(expressions)s)'''
output_field = SearchVectorField()
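The 'roze' note above is why two vectors are concatenated: the first is stemmed by SEARCH_CONFIG, while the second uses the simple config and keeps surface forms. A hypothetical annotation sketch (model, field, and class names are stand-ins, not from this patch):

    # Sketch only; 'Book'/'title' and both class names are assumptions.
    hits = Book.objects.annotate(
        vector=UnaccentTSVector('title'),   # stemmed vector || simple vector
    ).filter(vector=UnaccentSearchQuery('roze'))
    # 'roze' is kept verbatim in the simple-config vector, so it matches
    # even though the stemmer would not reduce 'róże' down to it.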
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from django.conf import settings
-from django.http.response import HttpResponseRedirect
from django.shortcuts import render
from django.views.decorators import cache
from django.http import HttpResponse, JsonResponse
from catalogue.models import Book, Tag
-from pdcounter.models import Author
-from picture.models import Picture
-from search.index import Search, SearchResult, PictureResult
from .forms import SearchFilters
-from suggest.forms import PublishingSuggestForm
import re
import json
from wolnelektury.utils import re_escape
-def match_word_re(word):
- if 'sqlite' in settings.DATABASES['default']['ENGINE']:
- return r"\b%s\b" % word
- elif 'mysql' in settings.DATABASES['default']['ENGINE']:
- return "[[:<:]]%s[[:>:]]" % word
-
-
query_syntax_chars = re.compile(r"[\\/*:(){}?.[\]+]")
return query_syntax_chars.sub(replace, query)
-def did_you_mean(query, tokens):
- return query
- # change = {}
- # for t in tokens:
- # authors = Tag.objects.filter(category='author', name__iregex=match_word_re(t))
- # if len(authors) > 0:
- # continue
-
- # if False:
- # if not dictionary.check(t):
- # try:
- # change_to = dictionary.suggest(t)[0].lower()
- # if change_to != t.lower():
- # change[t] = change_to
- # except IndexError:
- # pass
-
- # if change == {}:
- # return None
-
- # for frm, to in change.items():
- # query = query.replace(frm, to)
-
- # return query
-
-
@cache.never_cache
def hint(request, mozhint=False, param='term'):
prefix = request.GET.get(param, '')
ctx['hasresults'] = True
break
return render(request, 'search/results.html', ctx)
-
-
-@cache.never_cache
-def main(request):
- if request.EXPERIMENTS['layout'].value:
- return search(request)
-
- query = request.GET.get('q', '')
-
- format = request.GET.get('format')
- lang = request.GET.get('lang')
- epoch = request.GET.get('epoch')
- kind = request.GET.get('kind')
- genre = request.GET.get('genre')
-
- if len(query) < 2:
- return render(
- request, 'catalogue/search_too_short.html',
- {'prefix': query})
- elif len(query) > 256:
- return render(
- request, 'catalogue/search_too_long.html',
- {'prefix': query})
-
- query = prepare_query(query)
- if not (format or lang or epoch or kind or genre):
- pd_authors = search_pd_authors(query)
- else:
- pd_authors = []
- if not format or format != 'obraz':
- books = search_books(
- query,
- lang=lang,
- only_audio=format=='audio',
- only_synchro=format=='synchro',
- epoch=epoch,
- kind=kind,
- genre=genre
- )
- else:
- books = []
- if (not format or format == 'obraz') and not lang:
- pictures = search_pictures(
- query,
- epoch=epoch,
- kind=kind,
- genre=genre
- )
- else:
- pictures = []
-
- suggestion = ''
-
- if not (books or pictures or pd_authors):
- form = PublishingSuggestForm(initial={"books": query + ", "})
- return render(
- request,
- 'catalogue/search_no_hits.html',
- {
- 'form': form,
- 'did_you_mean': suggestion
- })
-
- if not (books or pictures) and len(pd_authors) == 1:
- return HttpResponseRedirect(pd_authors[0].get_absolute_url())
-
- return render(
- request,
- 'catalogue/search_multiple_hits.html',
- {
- 'pd_authors': pd_authors,
- 'books': books,
- 'pictures': pictures,
- 'did_you_mean': suggestion,
- 'set': {
- 'lang': lang,
- 'format': format,
- 'epoch': epoch,
- 'kind': kind,
- 'genre': genre,
- },
- 'tags': {
- 'epoch': Tag.objects.filter(category='epoch', for_books=True),
- 'genre': Tag.objects.filter(category='genre', for_books=True),
- 'kind': Tag.objects.filter(category='kind', for_books=True),
- },
- })
-
-def search_books(query, lang=None, only_audio=False, only_synchro=False, epoch=None, kind=None, genre=None):
- search = Search()
- results_parts = []
- search_fields = []
- words = query.split()
- fieldsets = (
- (['authors', 'authors_nonstem'], True),
- (['title', 'title_nonstem'], True),
- (['metadata', 'metadata_nonstem'], True),
- (['text', 'text_nonstem', 'themes_pl', 'themes_pl_nonstem'], False),
- )
- for fields, is_book in fieldsets:
- search_fields += fields
- results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book))
- results = []
- ids_results = {}
- for results_part in results_parts:
- for result in sorted(SearchResult.aggregate(results_part), reverse=True):
- book_id = result.book_id
- if book_id in ids_results:
- ids_results[book_id].merge(result)
- else:
- results.append(result)
- ids_results[book_id] = result
- descendant_ids = set(
- Book.objects.filter(id__in=ids_results, ancestor__in=ids_results).values_list('id', flat=True))
- results = [result for result in results if result.book_id not in descendant_ids]
- for result in results:
- search.get_snippets(result, query, num=3)
-
- def ensure_exists(r):
- try:
- if not r.book:
- return False
- except Book.DoesNotExist:
- return False
-
- if lang and r.book.language != lang:
- return False
- if only_audio and not r.book.has_mp3_file():
- return False
- if only_synchro and not r.book.has_daisy_file():
- return False
- if epoch and not r.book.tags.filter(category='epoch', slug=epoch).exists():
- return False
- if kind and not r.book.tags.filter(category='kind', slug=kind).exists():
- return False
- if genre and not r.book.tags.filter(category='genre', slug=genre).exists():
- return False
-
- return True
-
- results = [r for r in results if ensure_exists(r)]
- return results
-
-
-def search_pictures(query, epoch=None, kind=None, genre=None):
- search = Search()
- results_parts = []
- search_fields = []
- words = query.split()
- fieldsets = (
- (['authors', 'authors_nonstem'], True),
- (['title', 'title_nonstem'], True),
- (['metadata', 'metadata_nonstem'], True),
- (['themes_pl', 'themes_pl_nonstem'], False),
- )
- for fields, is_book in fieldsets:
- search_fields += fields
- results_parts.append(search.search_words(words, search_fields, required=fields, book=is_book, picture=True))
- results = []
- ids_results = {}
- for results_part in results_parts:
- for result in sorted(PictureResult.aggregate(results_part), reverse=True):
- picture_id = result.picture_id
- if picture_id in ids_results:
- ids_results[picture_id].merge(result)
- else:
- results.append(result)
- ids_results[picture_id] = result
-
- def ensure_exists(r):
- try:
- if not r.picture:
- return False
- except Picture.DoesNotExist:
- return False
-
- if epoch and not r.picture.tags.filter(category='epoch', slug=epoch).exists():
- return False
- if kind and not r.picture.tags.filter(category='kind', slug=kind).exists():
- return False
- if genre and not r.picture.tags.filter(category='genre', slug=genre).exists():
- return False
-
- return True
-
- results = [r for r in results if ensure_exists(r)]
- return results
-
-
-def search_pd_authors(query):
- pd_authors = Author.objects.filter(name__icontains=query)
- existing_slugs = Tag.objects.filter(
- category='author', slug__in=list(pd_authors.values_list('slug', flat=True))) \
- .values_list('slug', flat=True)
- pd_authors = pd_authors.exclude(slug__in=existing_slugs)
- return pd_authors
-
-
-def prepare_query(query):
- query = ' '.join(query.split())
- # filter out private use characters
- import unicodedata
- query = ''.join(ch for ch in query if unicodedata.category(ch) != 'Co')
- query = remove_query_syntax_chars(query)
-
- words = query.split()
- if len(words) > 10:
- query = ' '.join(words[:10])
- return query
CELERY_TASK_ALWAYS_EAGER = True
-# If SEARCH_INDEX not configured, disable the search.
-try:
- SOLR
-except NameError:
- NO_SEARCH_INDEX = True
-else:
- NO_SEARCH_INDEX = False
-
-
try:
SENTRY_DSN
except NameError:
DEFAULT_AUTO_FIELD = 'django.db.models.AutoField'
-SOLR_TEST = "http://localhost:8983/solr/wl_test/"
-SOLR_STOPWORDS = "/path/to/solr/data/conf/lang/stopwords_pl.txt"
-
# Local time zone for this installation. Choices can be found here:
# http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
# although not all choices may be available on all operating systems.
EXPERIMENTS_LAYOUT = 1
EXPERIMENTS_SOWKA = 0
-EXPERIMENTS_SEARCH = 0
WIDGETS = {}
+
+SEARCH_CONFIG = 'english'
+SEARCH_CONFIG_SIMPLE = 'simple'
+SEARCH_USE_UNACCENT = False
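These defaults keep tests and fresh databases working without extra dictionaries. A deployment with Polish text search support built in PostgreSQL (which the old hard-coded template assumed) would plausibly override them in local settings:

    # Local settings override (sketch) -- assumes the 'polish' and
    # 'polish_simple' text search configurations and the unaccent
    # extension exist in the database.
    SEARCH_CONFIG = 'polish'
    SEARCH_CONFIG_SIMPLE = 'polish_simple'
    SEARCH_USE_UNACCENT = True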
# Example: "/home/media/media.lawrence.com/"
MEDIA_ROOT = path.join(VAR_DIR, 'media/')
STATIC_ROOT = path.join(VAR_DIR, 'static/')
-SEARCH_INDEX = path.join(VAR_DIR, 'search_index/')
# URL that handles the media served from MEDIA_ROOT. Make sure to use a
# trailing slash if there is a path component (optional in other cases).
THUMBNAIL_BACKEND = 'wolnelektury.test_utils.DummyThumbnailBackend'
CATALOGUE_GET_MP3_LENGTH = 'catalogue.test_utils.get_mp3_length'
MEDIA_URL = '/media/'
+
+SEARCH_CONFIG = 'english'
+SEARCH_CONFIG_SIMPLE = 'simple'
+SEARCH_USE_UNACCENT = False