X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/fdd62169ba22c4c1be2f2306b5339eadd74ffb6d..f5cf23412d0e643e803a439b0aab0b676eb21a57:/apps/catalogue/xml_tools.py diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py old mode 100755 new mode 100644 index 928e57be..d6a9333b --- a/apps/catalogue/xml_tools.py +++ b/apps/catalogue/xml_tools.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +from copy import deepcopy from functools import wraps import re @@ -139,10 +141,11 @@ def compile_text(parts): for next_text in parts: if not next_text: continue - # trim the end, because there's more non-empty text - # don't trim beginning, if `text' is the first non-empty part - texts.append(_trim(text, trim_begin=trim_begin)) - trim_begin = True + if text: + # trim the end, because there's more non-empty text + # don't trim beginning, if `text' is the first non-empty part + texts.append(_trim(text, trim_begin=trim_begin)) + trim_begin = True text = next_text # don't trim the end, because there's no more text coming after `text' # only trim beginning if it's not still the first non-empty @@ -156,7 +159,7 @@ def change_master(text, master): """ e = etree.fromstring(text) e[-1].tag = master - return etree.tostring(e, encoding="utf-8") + return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8') def basic_structure(text, master): @@ -167,7 +170,7 @@ def basic_structure(text, master): ''' % (TRIM_BEGIN, TRIM_END)) e[0].tag = master e[0][0].tail = "\n"*3 + text + "\n"*3 - return etree.tostring(e, encoding="utf-8") + return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8') def add_trim_begin(text): @@ -182,7 +185,7 @@ def add_trim_begin(text): master.insert(0, trim_tag) trim_tag.tail = '\n\n\n' + (master.text or '') master.text = '\n' - return etree.tostring(e, encoding="utf-8") + return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8') def add_trim_end(text): @@ -201,4 +204,63 @@ def add_trim_end(text): prev.tail = (prev.tail or '') + '\n\n\n' else: 
def split_xml(text):
    """Split an XML document into chapters.

    The document is cut at every ``naglowek_rozdzial`` element, working
    from the last heading back to the first.  For each heading, the part
    of the tree starting at that heading is moved into a chunk of its own
    (wrapped in copies of the heading's ancestors), and the remaining
    tree is cut just before it.  Whatever is left before the first
    heading becomes the leading chunk.

    Every chunk but the first is passed through ``add_trim_begin`` and
    every chunk but the last through ``add_trim_end``.

    NOTE: all this stuff really must go somewhere else.

    :param text: serialized XML document to split
    :returns: list of two-element lists ``[name, xml]`` in document
        order; the first one is always named ``u'początek'``, the others
        are named after the text of their chapter heading.  Both items
        are unicode strings.
    """
    splitter = 'naglowek_rozdzial'
    splitter_path = './/' + splitter
    # Annotation-like elements whose content is not part of a title.
    non_title_tags = ('extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga')

    src = etree.fromstring(text)
    chunks = []

    parts = src.findall(splitter_path)
    while parts:
        # Copy the whole document first: `src' is about to lose the last
        # chapter, and `copied' will be reduced to just that chapter.
        copied = deepcopy(src)

        element = parts[-1]

        # Find the chapter's title: the heading's text with annotations
        # blanked out (their tails stay, they belong to the title).
        name_elem = deepcopy(element)
        # The copy carries the heading's tail, which would otherwise be
        # serialized as part of the title.
        name_elem.tail = None
        for tag in non_title_tags:
            for annotation in name_elem.findall('.//' + tag):
                annotation.text = ''
                del annotation[:]
        raw_name = etree.tostring(name_elem, method='text', encoding='utf-8')
        # Decode, so that names are unicode just like the XML payloads
        # and the hard-coded name of the first chunk.
        name = unicode(raw_name.strip(), 'utf-8')

        # In the original, remove everything from the start of the last
        # chapter: the heading and its following siblings...
        parent = element.getparent()
        del parent[parent.index(element):]
        # ...and then everything following each of its ancestors.
        element, parent = parent, parent.getparent()
        while parent is not None:
            del parent[parent.index(element) + 1:]
            element, parent = parent, parent.getparent()

        # In the copy, remove everything before the last chapter: on each
        # level drop the leading text and all preceding siblings.
        element = copied.findall(splitter_path)[-1]
        parent = element.getparent()
        while parent is not None:
            parent.text = None
            while parent[0] is not element:
                del parent[0]
            element, parent = parent, parent.getparent()

        # Chapters are found back to front, so prepend.
        chunks.insert(0, [
            name,
            unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8'),
        ])

        parts = src.findall(splitter_path)

    # What remains of `src' is the part preceding the first chapter.
    chunks.insert(0, [
        u'początek',
        unicode(etree.tostring(src, encoding='utf-8'), 'utf-8'),
    ])

    for chunk in chunks[1:]:
        chunk[1] = add_trim_begin(chunk[1])
    for chunk in chunks[:-1]:
        chunk[1] = add_trim_end(chunk[1])

    return chunks