1 # This file is part of FNP-Redakcja, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 from copy import deepcopy
8 from .constants import TRIM_BEGIN, TRIM_END, MASTERS
# Both markers share one shape: an XML comment that is alone on its line.
# MULTILINE makes ^ and $ anchor at every line boundary instead of only at
# the two ends of the whole text.
_TRIM_MARKER_LINE = "^<!--%s-->$"
RE_TRIM_BEGIN = re.compile(_TRIM_MARKER_LINE % TRIM_BEGIN, re.MULTILINE)
RE_TRIM_END = re.compile(_TRIM_MARKER_LINE % TRIM_END, re.MULTILINE)
class ParseError(Exception):
    """Raised when a document has no master tag to attach a trim marker to.

    Derives from Exception rather than BaseException, so that it is handled
    by ordinary ``except Exception`` clauses like any other application
    error instead of slipping past them the way KeyboardInterrupt or
    SystemExit do.
    """
def _trim(text, trim_begin=True, trim_end=True):
    """Cut `text` down to the span between the trim markers.

    Everything before the first RE_TRIM_BEGIN match and everything from the
    first RE_TRIM_END match onwards is dropped, so that e.g. one big XML
    file can be compiled from many small XML files.

    When the beginning is cut off, a ``<!--TRIM:n-->`` comment is put in its
    place, where n is the number of newlines in the removed prefix.

    :param text: the text to trim
    :param trim_begin: whether to cut at RE_TRIM_BEGIN
    :param trim_end: whether to cut at RE_TRIM_END
    :return: the trimmed text; a marker that does not occur leaves its side
        of the text untouched
    """
    if trim_begin:
        pieces = RE_TRIM_BEGIN.split(text, maxsplit=1)
        # A single piece means the marker is absent: keep the text whole.
        if len(pieces) > 1:
            skipped_lines = pieces[0].count('\n')
            text = f'<!--TRIM:{skipped_lines}-->' + pieces[1]
    if trim_end:
        # With no marker present, split() returns the whole text as item 0.
        text = RE_TRIM_END.split(text, maxsplit=1)[0]
    return text
def compile_text(parts):
    """Compile the full text from an iterable of parts, trimming where applicable.

    Empty parts are ignored. Among the remaining ones:

    * the first keeps its beginning, every later one has it trimmed;
    * the last keeps its end, every earlier one has it trimmed, because
      more non-empty text follows it.

    :param parts: iterable of text parts; falsy items are skipped
    :return: the trimmed parts joined into a single string
    """
    # When nothing non-empty was supplied, a lone empty string still goes
    # through _trim with both cuts switched off.
    pieces = [part for part in parts if part] or ['']
    last = len(pieces) - 1
    return "".join(
        _trim(piece, trim_begin=index > 0, trim_end=index < last)
        for index, piece in enumerate(pieces)
    )
def add_trim_begin(text):
    """Insert the TRIM_BEGIN marker at the start of the document's master tag.

    The master is the last child of the root element whose tag is listed in
    MASTERS. The marker comment becomes the master's first node, and the
    text that used to open the master is moved behind the marker, after
    three newlines.

    :param text: serialized XML document
    :return: the modified document, serialized again as str
    :raises ParseError: when the root has no child with a tag from MASTERS
    """
    trim_tag = etree.Comment(TRIM_BEGIN)
    root = etree.fromstring(text)
    # Search from the end. The else branch also covers a root without any
    # children, which would otherwise leave `master` unbound.
    for master in root[::-1]:
        if master.tag in MASTERS:
            break
    else:
        raise ParseError('No master tag found!')

    master.insert(0, trim_tag)
    trim_tag.tail = '\n\n\n' + (master.text or '')
    # The opening text now lives in the marker's tail, so it must not stay
    # in front of the marker as well. RE_TRIM_BEGIN anchors on a line start,
    # hence a newline is left there.
    # NOTE(review): presumably a single '\n' -- confirm the exact value.
    master.text = '\n'
    return str(etree.tostring(root, encoding="utf-8"), 'utf-8')
def add_trim_end(text):
    """Append the TRIM_END marker at the end of the document's master tag.

    The master is the last child of the root element whose tag is listed in
    MASTERS. The marker comment becomes the master's last node, and three
    newlines are added to whatever text directly precedes it.

    :param text: serialized XML document
    :return: the modified document, serialized again as str
    :raises ParseError: when the root has no child with a tag from MASTERS
    """
    trim_tag = etree.Comment(TRIM_END)
    root = etree.fromstring(text)
    # Search from the end. The else branch also covers a root without any
    # children, which would otherwise leave `master` unbound.
    for master in root[::-1]:
        if master.tag in MASTERS:
            break
    else:
        raise ParseError('No master tag found!')

    master.append(trim_tag)
    # RE_TRIM_END anchors on a line end, so the marker has to be followed
    # by a newline.
    # NOTE(review): presumably a single '\n' -- confirm the exact value.
    trim_tag.tail = '\n'
    preceding = trim_tag.getprevious()
    if preceding is not None:
        # Text right before the marker is the tail of the previous sibling.
        preceding.tail = (preceding.tail or '') + '\n\n\n'
    else:
        # The marker is the master's only node: the text before it is the
        # master's own text.
        master.text = (master.text or '') + '\n\n\n'
    return str(etree.tostring(root, encoding="utf-8"), 'utf-8')
92 """Splits text into chapters.
94 All this stuff really must go somewhere else.
97 src = etree.fromstring(text)
100 splitter = u'naglowek_rozdzial'
101 parts = src.findall('.//naglowek_rozdzial')
104 copied = deepcopy(src)
108 # find the chapter's title
109 name_elem = deepcopy(element)
110 for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
111 for a in name_elem.findall('.//' + tag):
114 name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
116 # in the original, remove everything from the start of the last chapter
117 parent = element.getparent()
118 del parent[parent.index(element):]
119 element, parent = parent, parent.getparent()
120 while parent is not None:
121 del parent[parent.index(element) + 1:]
122 element, parent = parent, parent.getparent()
124 # in the copy, remove everything before the last chapter
125 element = copied.findall('.//naglowek_rozdzial')[-1]
126 parent = element.getparent()
127 while parent is not None:
129 while parent[0] is not element:
131 element, parent = parent, parent.getparent()
133 str(etree.tostring(copied, encoding='utf-8'), 'utf-8')
136 parts = src.findall('.//naglowek_rozdzial')
138 chunks[:0] = [[u'początek',
139 str(etree.tostring(src, encoding='utf-8'), 'utf-8')
142 for ch in chunks[1:]:
143 ch[1] = add_trim_begin(ch[1])
144 for ch in chunks[:-1]:
145 ch[1] = add_trim_end(ch[1])