fix

[wolnelektury.git] / src / search / index.py
diff --git a/src/search/index.py b/src/search/index.py

index 2d84cb4..22c9a02 100644 (file)
--- a/src/search/index.py
+++ b/src/search/index.py
@@ -9,6 +9,9 @@ import os
  import re
  from django.conf import settings
  from librarian import dcparser
  import re
  from django.conf import settings
  from librarian import dcparser
+import librarian.meta.types.date
+import librarian.meta.types.person
+import librarian.meta.types.text
  from librarian.parser import WLDocument
  from lxml import etree
  import scorched
  from librarian.parser import WLDocument
  from lxml import etree
  import scorched
@@ -95,7 +98,10 @@ class Snippets(object):
          of the snippet stored there.
          """
          self.file.seek(pos[0], 0)
          of the snippet stored there.
          """
          self.file.seek(pos[0], 0)
-        txt = self.file.read(pos[1]).decode('utf-8')
+        try:
+            txt = self.file.read(pos[1]).decode('utf-8')
+        except:
+            return ''
          return txt
  
      def close(self):
          return txt
  
      def close(self):
@@ -122,6 +128,22 @@ class Index(SolrIndex):
      def __init__(self):
          super(Index, self).__init__(mode='rw')
  
      def __init__(self):
          super(Index, self).__init__(mode='rw')
  
+    def remove_snippets(self, book):
+        """Delete all objects reachable through ``book.snippet_set``."""
+        stored = book.snippet_set.all()
+        stored.delete()
+
+    def add_snippet(self, book, doc):
+        """Store the text of an indexed part as a snippet of ``book``.
+
+        ``doc`` is modified in place: ``book_id`` is always removed, and
+        for parts without a ``fragment_anchor`` key, ``text`` and
+        ``header_index`` are removed as well and passed to
+        ``book.snippet_set.create()`` as ``text`` and ``sec``.
+
+        Raises AssertionError if ``doc['book_id']`` differs from
+        ``book.id``; raises KeyError if a required key is missing.
+        """
+        # Pop outside of an ``assert`` statement: with ``python -O``
+        # asserts are stripped, which would silently skip both the pop
+        # and the consistency check.
+        book_id = doc.pop('book_id')
+        if book.id != book_id:
+            raise AssertionError(
+                "doc belongs to book %r, not %r" % (book_id, book.id))
+        # Fragments already exist and can be indexed where they live.
+        if 'fragment_anchor' in doc:
+            return
+
+        # NOTE(review): callers that later pass ``doc`` on to the legacy
+        # index will see it without the keys popped here -- confirm that
+        # this is intended.
+        text = doc.pop('text')
+        header_index = doc.pop('header_index')
+        book.snippet_set.create(
+            sec=header_index,
+            text=text,
+        )
+
      def delete_query(self, *queries):
          """
          index.delete(queries=...) doesn't work, so let's reimplement it
      def delete_query(self, *queries):
          """
          index.delete(queries=...) doesn't work, so let's reimplement it
@@ -223,30 +245,29 @@ class Index(SolrIndex):
              doc['parent_id'] = int(book.parent.id)
          return doc
  
              doc['parent_id'] = int(book.parent.id)
          return doc
  
-    def remove_book(self, book_or_id, remove_snippets=True):
+    def remove_book(self, book, remove_snippets=True, legacy=True):
          """Removes a book from search index.
          book - Book instance."""
          """Removes a book from search index.
          book - Book instance."""
-        if isinstance(book_or_id, catalogue.models.Book):
-            book_id = book_or_id.id
-        else:
-            book_id = book_or_id
-
-        self.delete_query(self.index.Q(book_id=book_id))
+        if legacy:
+          self.delete_query(self.index.Q(book_id=book.id))
  
  
-        if remove_snippets:
-            snippets = Snippets(book_id)
+          if remove_snippets:
+            snippets = Snippets(book.id)
              snippets.remove()
              snippets.remove()
+        self.remove_snippets(book)
  
  
-    def index_book(self, book, book_info=None, overwrite=True):
+    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
          """
          Indexes the book.
          Creates a lucene document for extracted metadata
          and calls self.index_content() to index the contents of the book.
          """
          """
          Indexes the book.
          Creates a lucene document for extracted metadata
          and calls self.index_content() to index the contents of the book.
          """
+        if not book.xml_file: return
+
          if overwrite:
              # we don't remove snippets, since they might be still needed by
              # threads using not reopened index
          if overwrite:
              # we don't remove snippets, since they might be still needed by
              # threads using not reopened index
-            self.remove_book(book, remove_snippets=False)
+            self.remove_book(book, remove_snippets=False, legacy=legacy)
  
          book_doc = self.create_book_doc(book)
          meta_fields = self.extract_metadata(book, book_info, dc_only=[
  
          book_doc = self.create_book_doc(book)
          meta_fields = self.extract_metadata(book, book_info, dc_only=[
@@ -259,7 +280,8 @@ class Index(SolrIndex):
              book_doc[n] = f
  
          book_doc['uid'] = "book%s" % book_doc['book_id']
              book_doc[n] = f
  
          book_doc['uid'] = "book%s" % book_doc['book_id']
-        self.index.add(book_doc)
+        if legacy:
+            self.index.add(book_doc)
          del book_doc
          book_fields = {
              'title': meta_fields['title'],
          del book_doc
          book_fields = {
              'title': meta_fields['title'],
@@ -271,7 +293,7 @@ class Index(SolrIndex):
              if tag_name in meta_fields:
                  book_fields[tag_name] = meta_fields[tag_name]
  
              if tag_name in meta_fields:
                  book_fields[tag_name] = meta_fields[tag_name]
  
-        self.index_content(book, book_fields=book_fields)
+        self.index_content(book, book_fields=book_fields, legacy=legacy)
  
      master_tags = [
          'opowiadanie',
  
      master_tags = [
          'opowiadanie',
@@ -303,7 +325,7 @@ class Index(SolrIndex):
          fields = {}
  
          if book_info is None:
          fields = {}
  
          if book_info is None:
-            book_info = dcparser.parse(open(book.xml_file.path))
+            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
  
          fields['slug'] = book.slug
          fields['is_book'] = True
  
          fields['slug'] = book.slug
          fields['is_book'] = True
@@ -315,21 +337,20 @@ class Index(SolrIndex):
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
              if hasattr(book_info, field.name):
                  if not getattr(book_info, field.name):
                      continue
-                # since no type information is available, we use validator
-                type_indicator = field.validator
-                if type_indicator == dcparser.as_unicode:
+                type_indicator = field.value_type
+                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                      s = getattr(book_info, field.name)
                      if field.multiple:
                          s = ', '.join(s)
                      fields[field.name] = s
                      s = getattr(book_info, field.name)
                      if field.multiple:
                          s = ', '.join(s)
                      fields[field.name] = s
-                elif type_indicator == dcparser.as_person:
+                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                      p = getattr(book_info, field.name)
                      p = getattr(book_info, field.name)
-                    if isinstance(p, dcparser.Person):
+                    if isinstance(p, librarian.meta.types.person.Person):
                          persons = str(p)
                      else:
                          persons = ', '.join(map(str, p))
                      fields[field.name] = persons
                          persons = str(p)
                      else:
                          persons = ', '.join(map(str, p))
                      fields[field.name] = persons
-                elif type_indicator == dcparser.as_date:
+                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                      dt = getattr(book_info, field.name)
                      fields[field.name] = dt
  
                      dt = getattr(book_info, field.name)
                      fields[field.name] = dt
  
@@ -363,7 +384,7 @@ class Index(SolrIndex):
              if master.tag in self.master_tags:
                  return master
  
              if master.tag in self.master_tags:
                  return master
  
-    def index_content(self, book, book_fields):
+    def index_content(self, book, book_fields, legacy=True):
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
          """
          Walks the book XML and extract content from it.
          Adds parts for each header tag and for each fragment.
@@ -463,9 +484,10 @@ class Index(SolrIndex):
                      elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                          handle_text.pop()
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
                      elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                          handle_text.pop()
                          doc = add_part(snippets, header_index=position, header_type=header.tag,
-                                       text=''.join(footnote),
-                                       is_footnote=True)
-                        self.index.add(doc)
+                                       text=''.join(footnote))
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)
                          footnote = []
  
                      # handle fragments and themes.
                          footnote = []
  
                      # handle fragments and themes.
@@ -499,7 +521,10 @@ class Index(SolrIndex):
                                         fragment_anchor=fid,
                                         text=fix_format(frag['text']),
                                         themes=frag['themes'])
                                         fragment_anchor=fid,
                                         text=fix_format(frag['text']),
                                         themes=frag['themes'])
-                        self.index.add(doc)
+                        # Add searchable fragment
+                        self.add_snippet(book, doc)
+                        if legacy:
+                            self.index.add(doc)
  
                          # Collect content.
  
  
                          # Collect content.
  
@@ -511,7 +536,9 @@ class Index(SolrIndex):
                  doc = add_part(snippets, header_index=position,
                                 header_type=header.tag, text=fix_format(content))
  
                  doc = add_part(snippets, header_index=position,
                                 header_type=header.tag, text=fix_format(content))
  
-                self.index.add(doc)
+                self.add_snippet(book, doc)
+                if legacy:
+                    self.index.add(doc)
  
          finally:
              snippets.close()
  
          finally:
              snippets.close()
@@ -636,7 +663,7 @@ class SearchResult(object):
          if self._book is not None:
              return self._book
          try:
          if self._book is not None:
              return self._book
          try:
-            self._book = catalogue.models.Book.objects.get(id=self.book_id)
+            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
          except catalogue.models.Book.DoesNotExist:
              self._book = None
          return self._book
          except catalogue.models.Book.DoesNotExist:
              self._book = None
          return self._book
@@ -903,7 +930,7 @@ class Search(SolrIndex):
  
      def search_by_author(self, words):
          from catalogue.models import Book
  
      def search_by_author(self, words):
          from catalogue.models import Book
-        books = Book.objects.filter(parent=None).order_by('-popularity__count')
+        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
          for word in words:
              books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
          return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
          for word in words:
              books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
          return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
@@ -977,7 +1004,7 @@ class Search(SolrIndex):
                  idx += 1
  
          except IOError as e:
                  idx += 1
  
          except IOError as e:
-            book = catalogue.models.Book.objects.filter(id=book_id)
+            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
              if not book:
                  log.error("Book does not exist for book id = %d" % book_id)
              elif not book.get().children.exists():
              if not book:
                  log.error("Book does not exist for book id = %d" % book_id)
              elif not book.get().children.exists():