X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/2f9c60b76f3ab4e69d794a6bb14388a81ff29eb7..a802a933f53625f732e9d3807b4162e770c8aaf1:/apps/catalogue/xml_tools.py?ds=inline diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py old mode 100755 new mode 100644 index 928e57be..abab7f27 --- a/apps/catalogue/xml_tools.py +++ b/apps/catalogue/xml_tools.py @@ -1,6 +1,8 @@ -from functools import wraps +# -*- coding: utf-8 -*- +from copy import deepcopy import re +from django.utils.encoding import force_str from lxml import etree from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS @@ -12,112 +14,8 @@ class ParseError(BaseException): pass -def obj_memoized(f): - """ - A decorator that caches return value of object methods. - The cache is kept with the object, in a _obj_memoized property. - """ - @wraps(f) - def wrapper(self, *args, **kwargs): - if not hasattr(self, '_obj_memoized'): - self._obj_memoized = {} - key = (f.__name__,) + args + tuple(sorted(kwargs.iteritems())) - try: - return self._obj_memoized[key] - except TypeError: - return f(self, *args, **kwargs) - except KeyError: - self._obj_memoized[key] = f(self, *args, **kwargs) - return self._obj_memoized[key] - return wrapper - - -class GradedText(object): - _edoc = None - - ROOT = 'utwor' - RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' - - def __init__(self, text): - self._text = text - - @obj_memoized - def is_xml(self): - """ - Determines if it's a well-formed XML. - - >>> GradedText("").is_xml() - True - >>> GradedText("").is_xml() - False - """ - try: - self._edoc = etree.fromstring(self._text) - except etree.XMLSyntaxError: - return False - return True - - @obj_memoized - def is_wl(self): - """ - Determines if it's an XML with a and a master tag. - - >>> GradedText("").is_wl() - True - >>> GradedText("").is_wl() - False - """ - if self.is_xml(): - e = self._edoc - # FIXME: there could be comments - ret = e.tag == self.ROOT and ( - len(e) == 1 and e[0].tag in MASTERS or - len(e) == 2 and e[0].tag == self.RDF - and e[1].tag in MASTERS) - if ret: - self._master = e[-1].tag - del self._edoc - return ret - else: - return False - - @obj_memoized - def is_broken_wl(self): - """ - Determines if it at least looks like broken WL file - and not just some untagged text. - - >>> GradedText("<").is_broken_wl() - True - >>> GradedText("some text").is_broken_wl() - False - """ - if self.is_wl(): - return True - text = self._text.strip() - return text.startswith('') and text.endswith('') - - def master(self): - """ - Gets the master tag. - - >>> GradedText("").master() - 'powiesc' - """ - assert self.is_wl() - return self._master - - @obj_memoized - def has_trim_begin(self): - return RE_TRIM_BEGIN.search(self._text) - - @obj_memoized - def has_trim_end(self): - return RE_TRIM_END.search(self._text) - - def _trim(text, trim_begin=True, trim_end=True): - """ + """ Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so that eg. one big XML file can be compiled from many small XML files. """ @@ -129,7 +27,7 @@ def _trim(text, trim_begin=True, trim_end=True): def compile_text(parts): - """ + """ Compiles full text from an iterable of parts, trimming where applicable. """ @@ -139,10 +37,11 @@ def compile_text(parts): for next_text in parts: if not next_text: continue - # trim the end, because there's more non-empty text - # don't trim beginning, if `text' is the first non-empty part - texts.append(_trim(text, trim_begin=trim_begin)) - trim_begin = True + if text: + # trim the end, because there's more non-empty text + # don't trim beginning, if `text' is the first non-empty part + texts.append(_trim(text, trim_begin=trim_begin)) + trim_begin = True text = next_text # don't trim the end, because there's no more text coming after `text' # only trim beginning if it's not still the first non-empty @@ -150,26 +49,6 @@ def compile_text(parts): return "".join(texts) -def change_master(text, master): - """ - Changes the master tag in a WL document. - """ - e = etree.fromstring(text) - e[-1].tag = master - return etree.tostring(e, encoding="utf-8") - - -def basic_structure(text, master): - e = etree.fromstring(''' - - - -''' % (TRIM_BEGIN, TRIM_END)) - e[0].tag = master - e[0][0].tail = "\n"*3 + text + "\n"*3 - return etree.tostring(e, encoding="utf-8") - - def add_trim_begin(text): trim_tag = etree.Comment(TRIM_BEGIN) e = etree.fromstring(text) @@ -182,7 +61,7 @@ def add_trim_begin(text): master.insert(0, trim_tag) trim_tag.tail = '\n\n\n' + (master.text or '') master.text = '\n' - return etree.tostring(e, encoding="utf-8") + return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8') def add_trim_end(text): @@ -201,4 +80,161 @@ def add_trim_end(text): prev.tail = (prev.tail or '') + '\n\n\n' else: master.text = (master.text or '') + '\n\n\n' - return etree.tostring(e, encoding="utf-8") + return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8') + + +def split_xml(text): + """Splits text into chapters. + + All this stuff really must go somewhere else. + + """ + src = etree.fromstring(text) + chunks = [] + + splitter = u'naglowek_rozdzial' + parts = src.findall('.//naglowek_rozdzial') + while parts: + # copy the document + copied = deepcopy(src) + + element = parts[-1] + + # find the chapter's title + name_elem = deepcopy(element) + for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga': + for a in name_elem.findall('.//' + tag): + a.text = '' + del a[:] + name = etree.tostring(name_elem, method='text', encoding='utf-8').strip() + + # in the original, remove everything from the start of the last chapter + parent = element.getparent() + del parent[parent.index(element):] + element, parent = parent, parent.getparent() + while parent is not None: + del parent[parent.index(element) + 1:] + element, parent = parent, parent.getparent() + + # in the copy, remove everything before the last chapter + element = copied.findall('.//naglowek_rozdzial')[-1] + parent = element.getparent() + while parent is not None: + parent.text = None + while parent[0] is not element: + del parent[0] + element, parent = parent, parent.getparent() + chunks[:0] = [[name, unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')]] + + parts = src.findall('.//naglowek_rozdzial') + + chunks[:0] = [[u'początek', unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')]] + + for ch in chunks[1:]: + ch[1] = add_trim_begin(ch[1]) + for ch in chunks[:-1]: + ch[1] = add_trim_end(ch[1]) + + return chunks + + +def wl2_to_wl1(wl2_xml, slug): + from lxml import etree + import re + from StringIO import StringIO + from urllib import unquote + import os.path + from django.conf import settings + from fnpdjango.utils.text.slughifi import slughifi + from librarian import ParseError, DCNS + + def _register_function(f): + """ Register extension function with lxml """ + ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') + ns[f.__name__] = f + return f + + @_register_function + def slugify(context, text): + """Remove unneeded whitespace from beginning and end""" + if isinstance(text, list): + text = ''.join(text) + return slughifi(text) + + @_register_function + def rmext(context, text): + if isinstance(text, list): + text = ''.join(text) + text = unquote(text) + if '.' in text: + name, ext = text.rsplit('.', 1) + if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'): + text = name + return text + + t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt')) + ft = wl2_xml.replace(' ', ' ') + f2 = StringIO(ft) + i1 = etree.parse(f2) + + for sect in i1.findall('//section'): + if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć': + # Prostujemy. + first = sect.find('section') + subs = first.findall('.//section') + for sub in subs: + sect.append(sub) + break + else: + dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri}) + if dc_type and dc_type[0] in ('course', 'synthetic'): + raise ParseError('Brak przebiegu') + + i1.getroot().attrib['redslug'] = slug + i1.getroot().attrib['wlslug'] = slug # THIS! + w1t = i1.xslt(t) + for h in w1t.findall('//aktywnosc/opis'): + if len(h) == 0: + raise ParseError('Pusty element aktywnosc/opis') + # FIXME assumption that every lesson has at most 9 parts + if not h[0].text or not re.match(r'\d\.\s', h[0].text): + raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text)) + h[0].text = h[0].text[3:] + return etree.tostring(w1t, encoding='utf-8') + + +EXCEPTIONS = [ + ('div', 'img'), + ('div', 'video'), + ('div', 'table.cell'), +] + + +def remove_element(element): + parent = element.getparent() + tail = element.tail + if tail: + prev = element.getprevious() + if prev is not None: + prev.tail = (prev.tail or '') + tail + else: + parent.text = (parent.text or '') + tail + parent.remove(element) + + +def remove_empty_elements(xml): + try: + tree = etree.fromstring(force_str(xml.replace(' ', u'\xa0'))) + except SyntaxError: + return None + changed = False + another_loop = True + while another_loop: + another_loop = False + for element in tree.findall('.//*'): + if (not element.text or not element.text.strip()) and len(element) == 0: + if (element.tag, element.attrib.get('class')) not in EXCEPTIONS: + remove_element(element) + changed = True + another_loop = True + return etree.tostring(tree, encoding=unicode) if changed else None