src/documents/xml_tools.py

   1 # This file is part of FNP-Redakcja, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from copy import deepcopy
   5 import re
   6
   7 from lxml import etree
   8 from .constants import TRIM_BEGIN, TRIM_END, MASTERS
   9
  10 RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % TRIM_BEGIN, re.M)
  11 RE_TRIM_END = re.compile("^<!--%s-->$" % TRIM_END, re.M)
  12
  13
  14 class ParseError(BaseException):
  15     pass
  16
  17
  18 def _trim(text, trim_begin=True, trim_end=True):
  19     """
  20         Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
  21         that eg. one big XML file can be compiled from many small XML files.
  22     """
  23     if trim_begin:
  24         parts = RE_TRIM_BEGIN.split(text, maxsplit=1)
  25         text = parts[-1]
  26         if len(parts) > 1:
  27             lines = parts[0].count('\n')
  28             text = f'<!--TRIM:{lines}-->' + text
  29     if trim_end:
  30         text = RE_TRIM_END.split(text, maxsplit=1)[0]
  31     return text
  32
  33
  34 def compile_text(parts):
  35     """
  36         Compiles full text from an iterable of parts,
  37         trimming where applicable.
  38     """
  39     texts = []
  40     trim_begin = False
  41     text = ''
  42     for next_text in parts:
  43         if not next_text:
  44             continue
  45         if text:
  46             # trim the end, because there's more non-empty text
  47             # don't trim beginning, if `text' is the first non-empty part
  48             texts.append(_trim(text, trim_begin=trim_begin))
  49             trim_begin = True
  50         text = next_text
  51     # don't trim the end, because there's no more text coming after `text'
  52     # only trim beginning if it's not still the first non-empty
  53     texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
  54     return "".join(texts)
  55
  56
  57 def add_trim_begin(text):
  58     trim_tag = etree.Comment(TRIM_BEGIN)
  59     e = etree.fromstring(text)
  60     for master in e[::-1]:
  61         if master.tag in MASTERS:
  62             break
  63     if master.tag not in MASTERS:
  64         raise ParseError('No master tag found!')
  65
  66     master.insert(0, trim_tag)
  67     trim_tag.tail = '\n\n\n' + (master.text or '')
  68     master.text = '\n'
  69     return str(etree.tostring(e, encoding="utf-8"), 'utf-8')
  70
  71
  72 def add_trim_end(text):
  73     trim_tag = etree.Comment(TRIM_END)
  74     e = etree.fromstring(text)
  75     for master in e[::-1]:
  76         if master.tag in MASTERS:
  77             break
  78     if master.tag not in MASTERS:
  79         raise ParseError('No master tag found!')
  80
  81     master.append(trim_tag)
  82     trim_tag.tail = '\n'
  83     prev = trim_tag.getprevious()
  84     if prev is not None:
  85         prev.tail = (prev.tail or '') + '\n\n\n'
  86     else:
  87         master.text = (master.text or '') + '\n\n\n'
  88     return str(etree.tostring(e, encoding="utf-8"), 'utf-8')
  89
  90
  91 def split_xml(text):
  92     """Splits text into chapters.
  93
  94     All this stuff really must go somewhere else.
  95
  96     """
  97     src = etree.fromstring(text)
  98     chunks = []
  99
 100     splitter = u'naglowek_rozdzial'
 101     parts = src.findall('.//naglowek_rozdzial')
 102     while parts:
 103         # copy the document
 104         copied = deepcopy(src)
 105
 106         element = parts[-1]
 107
 108         # find the chapter's title
 109         name_elem = deepcopy(element)
 110         for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
 111             for a in name_elem.findall('.//' + tag):
 112                 a.text=''
 113                 del a[:]
 114         name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
 115
 116         # in the original, remove everything from the start of the last chapter
 117         parent = element.getparent()
 118         del parent[parent.index(element):]
 119         element, parent = parent, parent.getparent()
 120         while parent is not None:
 121             del parent[parent.index(element) + 1:]
 122             element, parent = parent, parent.getparent()
 123
 124         # in the copy, remove everything before the last chapter
 125         element = copied.findall('.//naglowek_rozdzial')[-1]
 126         parent = element.getparent()
 127         while parent is not None:
 128             parent.text = None
 129             while parent[0] is not element:
 130                 del parent[0]
 131             element, parent = parent, parent.getparent()
 132         chunks[:0] = [[name,
 133             str(etree.tostring(copied, encoding='utf-8'), 'utf-8')
 134             ]]
 135
 136         parts = src.findall('.//naglowek_rozdzial')
 137
 138     chunks[:0] = [[u'początek',
 139         str(etree.tostring(src, encoding='utf-8'), 'utf-8')
 140         ]]
 141
 142     for ch in chunks[1:]:
 143         ch[1] = add_trim_begin(ch[1])
 144     for ch in chunks[:-1]:
 145         ch[1] = add_trim_end(ch[1])
 146
 147     return chunks