src/search/index.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 import re
   5 from librarian.parser import WLDocument
   6 from lxml import etree
   7
   8
   9 class Index:
  10     """
  11     Class indexing books.
  12     """
  13     master_tags = [
  14         'opowiadanie',
  15         'powiesc',
  16         'dramat_wierszowany_l',
  17         'dramat_wierszowany_lp',
  18         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
  19         'wywiad',
  20     ]
  21
  22     ignore_content_tags = [
  23         'uwaga', 'extra', 'nota_red', 'abstrakt',
  24         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
  25         'didaskalia',
  26         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw'
  27     ]
  28
  29     footnote_tags = ['pa', 'pt', 'pr', 'pe']
  30
  31     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
  32                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  33
  34     @classmethod
  35     def get_master(cls, root):
  36         """
  37         Returns the first master tag from an etree.
  38         """
  39         for master in root.iter():
  40             if master.tag in cls.master_tags:
  41                 return master
  42
  43     @staticmethod
  44     def add_snippet(book, text, position):
  45         book.snippet_set.create(
  46             sec=position + 1,
  47             text=text
  48         )
  49
  50     @classmethod
  51     def index_book(cls, book):
  52         """
  53         Walks the book XML and extract content from it.
  54         Adds parts for each header tag and for each fragment.
  55         """
  56         if not book.xml_file: return
  57
  58         book.snippet_set.all().delete()
  59
  60         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
  61         root = wld.edoc.getroot()
  62
  63         master = cls.get_master(root)
  64         if master is None:
  65             return []
  66
  67         def walker(node):
  68             if node.tag not in cls.ignore_content_tags:
  69                 yield node, None, None
  70                 if node.text is not None:
  71                     yield None, node.text, None
  72                 for child in list(node):
  73                     for b, t, e in walker(child):
  74                         yield b, t, e
  75                 yield None, None, node
  76
  77             if node.tail is not None:
  78                 yield None, node.tail, None
  79             return
  80
  81         def fix_format(text):
  82             if isinstance(text, list):
  83                 text = filter(lambda s: s is not None, content)
  84                 text = ' '.join(text)
  85
  86             return re.sub("(?m)/$", "", text)
  87
  88         for position, header in enumerate(master):
  89             if header.tag in cls.skip_header_tags:
  90                 continue
  91             if header.tag is etree.Comment:
  92                 continue
  93
  94             # section content
  95             content = []
  96             footnote = []
  97
  98             def all_content(text):
  99                 content.append(text)
 100             handle_text = [all_content]
 101
 102             for start, text, end in walker(header):
 103                 # handle footnotes
 104                 if start is not None and start.tag in cls.footnote_tags:
 105                     footnote = []
 106
 107                     def collect_footnote(t):
 108                         footnote.append(t)
 109
 110                     handle_text.append(collect_footnote)
 111                 elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
 112                     handle_text.pop()
 113                     cls.add_snippet(book, ''.join(footnote), position)
 114                     footnote = []
 115
 116                 if text is not None and handle_text is not []:
 117                     hdl = handle_text[-1]
 118                     hdl(text)
 119
 120             # in the end, add a section text.
 121             cls.add_snippet(book, fix_format(content), position)