X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/13f2f447ee06e5a95b86ee6d027b17d209518fdd..0a7090f11131631647db366ff87976407e788412:/src/search/index.py diff --git a/src/search/index.py b/src/search/index.py index 2be60fdf7..9784d49a7 100644 --- a/src/search/index.py +++ b/src/search/index.py @@ -9,6 +9,9 @@ import os import re from django.conf import settings from librarian import dcparser +import librarian.meta.types.date +import librarian.meta.types.person +import librarian.meta.types.text from librarian.parser import WLDocument from lxml import etree import scorched @@ -125,6 +128,22 @@ class Index(SolrIndex): def __init__(self): super(Index, self).__init__(mode='rw') + def remove_snippets(self, book): + book.snippet_set.all().delete() + + def add_snippet(self, book, doc): + assert book.id == doc.pop('book_id') + # Fragments already exist and can be indexed where they live. + if 'fragment_anchor' in doc: + return + + text = doc.pop('text') + header_index = doc.pop('header_index') + book.snippet_set.create( + sec=header_index, + text=text, + ) + def delete_query(self, *queries): """ index.delete(queries=...) doesn't work, so let's reimplement it @@ -226,19 +245,15 @@ class Index(SolrIndex): doc['parent_id'] = int(book.parent.id) return doc - def remove_book(self, book_or_id, remove_snippets=True): + def remove_book(self, book, remove_snippets=True): """Removes a book from search index. book - Book instance.""" - if isinstance(book_or_id, catalogue.models.Book): - book_id = book_or_id.id - else: - book_id = book_or_id - - self.delete_query(self.index.Q(book_id=book_id)) + self.delete_query(self.index.Q(book_id=book.id)) if remove_snippets: - snippets = Snippets(book_id) + snippets = Snippets(book.id) snippets.remove() + self.remove_snippets(book) def index_book(self, book, book_info=None, overwrite=True): """ @@ -246,6 +261,8 @@ class Index(SolrIndex): Creates a lucene document for extracted metadata and calls self.index_content() to index the contents of the book. """ + if not book.xml_file: return + if overwrite: # we don't remove snippets, since they might be still needed by # threads using not reopened index @@ -306,7 +323,7 @@ class Index(SolrIndex): fields = {} if book_info is None: - book_info = dcparser.parse(open(book.xml_file.path)) + book_info = dcparser.parse(open(book.xml_file.path, 'rb')) fields['slug'] = book.slug fields['is_book'] = True @@ -318,21 +335,20 @@ class Index(SolrIndex): if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue - # since no type information is available, we use validator - type_indicator = field.validator - if type_indicator == dcparser.as_unicode: + type_indicator = field.value_type + if issubclass(type_indicator, librarian.meta.types.text.TextValue): s = getattr(book_info, field.name) if field.multiple: s = ', '.join(s) fields[field.name] = s - elif type_indicator == dcparser.as_person: + elif issubclass(type_indicator, librarian.meta.types.person.Person): p = getattr(book_info, field.name) - if isinstance(p, dcparser.Person): + if isinstance(p, librarian.meta.types.person.Person): persons = str(p) else: persons = ', '.join(map(str, p)) fields[field.name] = persons - elif type_indicator == dcparser.as_date: + elif issubclass(type_indicator, librarian.meta.types.date.DateValue): dt = getattr(book_info, field.name) fields[field.name] = dt @@ -466,8 +482,8 @@ class Index(SolrIndex): elif end is not None and footnote is not [] and end.tag in self.footnote_tags: handle_text.pop() doc = add_part(snippets, header_index=position, header_type=header.tag, - text=''.join(footnote), - is_footnote=True) + text=''.join(footnote)) + self.add_snippet(book, doc) self.index.add(doc) footnote = [] @@ -502,6 +518,8 @@ class Index(SolrIndex): fragment_anchor=fid, text=fix_format(frag['text']), themes=frag['themes']) + # Add searchable fragment + self.add_snippet(book, doc) self.index.add(doc) # Collect content. @@ -514,6 +532,7 @@ class Index(SolrIndex): doc = add_part(snippets, header_index=position, header_type=header.tag, text=fix_format(content)) + self.add_snippet(book, doc) self.index.add(doc) finally: