src/search/index.py

   1 # This file is part of Wolne Lektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
   3 #
   4 import re
   5 from librarian.elements.base import WLElement
   6 from librarian.document import WLDocument
   7 from lxml import etree
   8
   9
  10 class Index:
  11     """
  12     Class indexing books.
  13     """
  14     master_tags = [
  15         'opowiadanie',
  16         'powiesc',
  17         'dramat_wierszowany_l',
  18         'dramat_wierszowany_lp',
  19         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
  20         'wywiad',
  21     ]
  22
  23     ignore_content_tags = [
  24         'uwaga', 'extra', 'nota_red', 'abstrakt',
  25         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
  26         'didaskalia',
  27         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc', 'motyw'
  28     ]
  29
  30     footnote_tags = ['pa', 'pt', 'pr', 'pe']
  31
  32     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
  33                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
  34
  35     @staticmethod
  36     def add_snippet(book, text, position, anchor):
  37         book.snippet_set.create(
  38             sec=position + 1,
  39             text=text,
  40             anchor=anchor
  41         )
  42
  43     # TODO: The section links stuff won't work.
  44     @classmethod
  45     def index_book(cls, book):
  46         """
  47         Walks the book XML and extract content from it.
  48         Adds parts for each header tag and for each fragment.
  49         """
  50         if not book.xml_file: return
  51
  52         book.snippet_set.all().delete()
  53
  54         wld = WLDocument(filename=book.xml_file.path)
  55         wld.assign_ids()
  56
  57         master = wld.tree.getroot().master
  58         if master is None:
  59             return []
  60
  61         def get_indexable(element):
  62             for child in element:
  63                 if not isinstance(child, WLElement):
  64                     continue
  65                 if not child.attrib.get('_id'):
  66                     for e in get_indexable(child):
  67                         yield e
  68                 else:
  69                     yield child
  70
  71         def walker(node):
  72             if node.tag not in cls.ignore_content_tags:
  73                 yield node, None, None
  74                 if node.text is not None:
  75                     yield None, node.text, None
  76                 for child in list(node):
  77                     for b, t, e in walker(child):
  78                         yield b, t, e
  79                 yield None, None, node
  80
  81             if node.tail is not None:
  82                 yield None, node.tail, None
  83             return
  84
  85         def fix_format(text):
  86             if isinstance(text, list):
  87                 text = filter(lambda s: s is not None, content)
  88                 text = ' '.join(text)
  89
  90             return re.sub("(?m)/$", "", text)
  91
  92         for position, header in enumerate(get_indexable(master)):
  93             if header.tag in cls.skip_header_tags:
  94                 continue
  95             if header.tag is etree.Comment:
  96                 continue
  97
  98             el_id = header.attrib['_id']
  99
 100             # section content
 101             content = []
 102             footnote = []
 103
 104             def all_content(text):
 105                 content.append(text)
 106             handle_text = [all_content]
 107
 108             for start, text, end in walker(header):
 109                 # handle footnotes
 110                 if start is not None and start.tag in cls.footnote_tags:
 111                     footnote = []
 112
 113                     def collect_footnote(t):
 114                         footnote.append(t)
 115
 116                     handle_text.append(collect_footnote)
 117                 elif end is not None and footnote is not [] and end.tag in cls.footnote_tags:
 118                     handle_text.pop()
 119                     cls.add_snippet(book, ''.join(footnote), position, el_id)
 120                     footnote = []
 121
 122                 if text is not None and handle_text is not []:
 123                     hdl = handle_text[-1]
 124                     hdl(text)
 125
 126             # in the end, add a section text.
 127             cls.add_snippet(book, fix_format(content), position, el_id)