# (tag, class attribute) pairs that remove_empty_elements() never removes,
# even when the element has no text and no children.
EXCEPTIONS = [
    ('div', 'img'),
    ('div', 'video'),
    ('div', 'table.cell'),
]


def remove_element(element):
    """Detach ``element`` from its parent without losing its tail text.

    In lxml the text that follows an element's closing tag is stored on the
    element itself (``tail``), so removing the element would drop that text.
    The tail is therefore appended to the previous sibling's tail or, when
    there is no previous sibling, to the parent's text before the removal.

    The element must have a parent; calling this on a root element fails
    because there is nothing to detach it from.
    """
    parent = element.getparent()
    tail = element.tail
    if tail:
        prev = element.getprevious()
        if prev is not None:
            prev.tail = (prev.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(element)


def remove_empty_elements(xml):
    """Remove elements that have neither children nor non-blank text.

    The root element is never removed, and neither are elements whose
    ``(tag, class)`` pair is listed in ``EXCEPTIONS``. Removal is repeated
    until a full pass changes nothing, because deleting a child can leave
    its parent empty as well.

    Returns the serialized document as a text string when at least one
    element was removed, or ``None`` when nothing changed or ``xml`` could
    not be parsed.
    """
    try:
        # XML predefines only five entities, so a literal ``&nbsp;`` makes the
        # parser fail; substitute the actual no-break space character.
        tree = etree.fromstring(force_str(xml.replace('&nbsp;', u'\xa0')))
    except SyntaxError:
        # lxml's XML parse errors derive from the built-in SyntaxError.
        return None
    changed = False
    another_loop = True
    while another_loop:
        another_loop = False
        # findall() returns a list, so removing elements while looping is safe.
        for element in tree.findall('.//*'):
            if len(element) or (element.text and element.text.strip()):
                continue
            if (element.tag, element.attrib.get('class')) in EXCEPTIONS:
                continue
            remove_element(element)
            changed = True
            another_loop = True
    # The string 'unicode' is accepted by lxml on both Python 2 and 3, unlike
    # the Python 2-only ``unicode`` builtin.
    return etree.tostring(tree, encoding='unicode') if changed else None