X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/a169ba7caa0d9d83fddcbad5ed05f536e0ca1b9e..98062d2158ebe1f734d811691ab15e6887684281:/src/search/index.py

diff --git a/src/search/index.py b/src/search/index.py
index bd31a2acf..22c9a02ae 100644
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -9,6 +9,9 @@ import os
 import re
 from django.conf import settings
 from librarian import dcparser
+import librarian.meta.types.date
+import librarian.meta.types.person
+import librarian.meta.types.text
 from librarian.parser import WLDocument
 from lxml import etree
 import scorched
@@ -95,7 +98,10 @@ class Snippets(object):
         of the snippet stored there.
         """
         self.file.seek(pos[0], 0)
-        txt = self.file.read(pos[1]).decode('utf-8')
+        try:
+            txt = self.file.read(pos[1]).decode('utf-8')
+        except:
+            return ''
         return txt

     def close(self):
@@ -122,6 +128,22 @@ class Index(SolrIndex):
     def __init__(self):
         super(Index, self).__init__(mode='rw')

+    def remove_snippets(self, book):
+        book.snippet_set.all().delete()
+
+    def add_snippet(self, book, doc):
+        assert book.id == doc.pop('book_id')
+        # Fragments already exist and can be indexed where they live.
+        if 'fragment_anchor' in doc:
+            return
+
+        text = doc.pop('text')
+        header_index = doc.pop('header_index')
+        book.snippet_set.create(
+            sec=header_index,
+            text=text,
+        )
+
     def delete_query(self, *queries):
         """
         index.delete(queries=...) doesn't work, so let's reimplement it
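
The hunk above replaces file-offset snippet lookups with rows on the book's
snippet_set: add_snippet stores one row per indexed header, keyed by section
number, and skips fragment documents, which per the in-code comment already
live elsewhere. A minimal sketch of that behaviour, assuming `index` is an
Index instance and `book` a catalogue Book; the field values are invented
for illustration:

    # Hypothetical part document, as produced during indexing.
    doc = {
        'book_id': book.id,
        'header_index': 3,                    # section number
        'header_type': 'naglowek_rozdzial',   # example header tag
        'text': 'Litwo! Ojczyzno moja! ...',  # made-up snippet text
    }
    index.add_snippet(book, doc)
    # -> creates one row: book.snippet_set.create(sec=3, text='Litwo!...')

    # Fragment documents are skipped; only 'book_id' is popped and checked.
    frag = {'book_id': book.id, 'fragment_anchor': 'f123', 'text': '...'}
    index.add_snippet(book, frag)   # no row created

Note that add_snippet mutates the dict it receives (it pops 'book_id',
'text' and 'header_index'), which matters for the dual writes further down.
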
""" + if not book.xml_file: return + if overwrite: # we don't remove snippets, since they might be still needed by # threads using not reopened index - self.remove_book(book, remove_snippets=False) + self.remove_book(book, remove_snippets=False, legacy=legacy) book_doc = self.create_book_doc(book) meta_fields = self.extract_metadata(book, book_info, dc_only=[ @@ -259,7 +280,8 @@ class Index(SolrIndex): book_doc[n] = f book_doc['uid'] = "book%s" % book_doc['book_id'] - self.index.add(book_doc) + if legacy: + self.index.add(book_doc) del book_doc book_fields = { 'title': meta_fields['title'], @@ -271,7 +293,7 @@ class Index(SolrIndex): if tag_name in meta_fields: book_fields[tag_name] = meta_fields[tag_name] - self.index_content(book, book_fields=book_fields) + self.index_content(book, book_fields=book_fields, legacy=legacy) master_tags = [ 'opowiadanie', @@ -303,7 +325,7 @@ class Index(SolrIndex): fields = {} if book_info is None: - book_info = dcparser.parse(open(book.xml_file.path)) + book_info = dcparser.parse(open(book.xml_file.path, 'rb')) fields['slug'] = book.slug fields['is_book'] = True @@ -315,21 +337,20 @@ class Index(SolrIndex): if hasattr(book_info, field.name): if not getattr(book_info, field.name): continue - # since no type information is available, we use validator - type_indicator = field.validator - if type_indicator == dcparser.as_unicode: + type_indicator = field.value_type + if issubclass(type_indicator, librarian.meta.types.text.TextValue): s = getattr(book_info, field.name) if field.multiple: s = ', '.join(s) fields[field.name] = s - elif type_indicator == dcparser.as_person: + elif issubclass(type_indicator, librarian.meta.types.person.Person): p = getattr(book_info, field.name) - if isinstance(p, dcparser.Person): + if isinstance(p, librarian.meta.types.person.Person): persons = str(p) else: persons = ', '.join(map(str, p)) fields[field.name] = persons - elif type_indicator == dcparser.as_date: + elif issubclass(type_indicator, librarian.meta.types.date.DateValue): dt = getattr(book_info, field.name) fields[field.name] = dt @@ -363,7 +384,7 @@ class Index(SolrIndex): if master.tag in self.master_tags: return master - def index_content(self, book, book_fields): + def index_content(self, book, book_fields, legacy=True): """ Walks the book XML and extract content from it. Adds parts for each header tag and for each fragment. @@ -390,16 +411,16 @@ class Index(SolrIndex): return def fix_format(text): - # separator = [u" ", u"\t", u".", u";", u","] + # separator = [" ", "\t", ".", ";", ","] if isinstance(text, list): # need to join it first text = filter(lambda s: s is not None, content) - text = u' '.join(text) + text = ' '.join(text) # for i in range(len(text)): # if i > 0: # if text[i][0] not in separator\ # and text[i - 1][-1] not in separator: - # text.insert(i, u" ") + # text.insert(i, " ") return re.sub("(?m)/$", "", text) @@ -463,9 +484,10 @@ class Index(SolrIndex): elif end is not None and footnote is not [] and end.tag in self.footnote_tags: handle_text.pop() doc = add_part(snippets, header_index=position, header_type=header.tag, - text=u''.join(footnote), - is_footnote=True) - self.index.add(doc) + text=''.join(footnote)) + self.add_snippet(book, doc) + if legacy: + self.index.add(doc) footnote = [] # handle fragments and themes. 
@@ -499,7 +521,10 @@ class Index(SolrIndex):
                                            fragment_anchor=fid,
                                            text=fix_format(frag['text']),
                                            themes=frag['themes'])
-                        self.index.add(doc)
+                        # Add searchable fragment
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)

             # Collect content.

@@ -511,7 +536,9 @@ class Index(SolrIndex):
                     doc = add_part(snippets, header_index=position,
                                    header_type=header.tag, text=fix_format(content))
-                    self.index.add(doc)
+                    self.add_snippet(book, doc)
+                    if legacy:
+                        self.index.add(doc)

         finally:
             snippets.close()
@@ -613,7 +640,7 @@ class SearchResult(object):
         return result

     def __str__(self):
-        return u"<SearchResult: book_id=%d, #hits=%d/%d, score=%f, snippets=%d>" % \
+        return "<SearchResult: book_id=%d, #hits=%d/%d, score=%f, snippets=%d>" % \
             (self.book_id, len(self._hits),
              len(self._processed_hits) if self._processed_hits else -1,
              self._score, len(self.snippets))
@@ -636,7 +663,7 @@ class SearchResult(object):
         if self._book is not None:
             return self._book
         try:
-            self._book = catalogue.models.Book.objects.get(id=self.book_id)
+            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
         except catalogue.models.Book.DoesNotExist:
             self._book = None
         return self._book
@@ -665,20 +692,18 @@ class SearchResult(object):
             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
             frags))), sect)

-        def remove_duplicates(lst, keyfn, compare):
+        def remove_duplicates(lst, keyfn, larger):
             els = {}
             for e in lst:
                 eif = keyfn(e)
                 if eif in els:
-                    if compare(els[eif], e) >= 1:
+                    if larger(els[eif], e):
                         continue
                 els[eif] = e
             return els.values()

         # remove fragments with duplicated fid's and duplicated snippets
-        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
-        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
-        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
+        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

         # remove duplicate sections
         sections = {}
@@ -802,7 +827,7 @@ class PictureResult(object):
             self._hits.append(hit)

     def __str__(self):
-        return u"<PictureResult: picture_id=%d, score=%f>" % (self.picture_id, self._score)
+        return "<PictureResult: picture_id=%d, score=%f>" % (self.picture_id, self._score)

     def __repr__(self):
         return str(self)
@@ -905,7 +930,7 @@ class Search(SolrIndex):

     def search_by_author(self, words):
         from catalogue.models import Book
-        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
         for word in words:
             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
@@ -979,7 +1004,7 @@ class Search(SolrIndex):
                         idx += 1

         except IOError as e:
-            book = catalogue.models.Book.objects.filter(id=book_id)
+            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
             if not book:
                 log.error("Book does not exist for book id = %d" % book_id)
             elif not book.get().children.exists():
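
The remove_duplicates hunk above is straight Python 3 migration work: the
global cmp() is gone, so the three-way `compare` callback becomes a boolean
`larger` predicate and the helper keeps whichever duplicate compares as
larger (here, the higher-scoring hit). A self-contained sketch of the
rewritten helper, with invented example data:

    def remove_duplicates(lst, keyfn, larger):
        els = {}
        for e in lst:
            key = keyfn(e)
            # Keep the stored element only if it compares as larger.
            if key in els and larger(els[key], e):
                continue
            els[key] = e
        return list(els.values())

    # Deduplicate (fragment_id, score) hits, keeping the best score per id.
    hits = [('f1', 0.2), ('f2', 0.9), ('f1', 0.7)]
    best = remove_duplicates(hits, lambda h: h[0], lambda a, b: a[1] > b[1])
    print(sorted(best))   # [('f1', 0.7), ('f2', 0.9)]
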