def wl2_to_wl1(wl2_xml, slug):
    """Convert a WL2 XML document to WL1 XML with the project stylesheet.

    The input is parsed, the lesson-flow section (the one whose first child
    reads u'Przebieg zajęć') is flattened, the root element is tagged with
    the slug, and ``xslt/wl2to1.xslt`` (looked up under
    ``settings.PROJECT_ROOT``) is applied.  Finally the leading "N. "
    numbering is stripped from the first child of every ``aktywnosc/opis``
    element in the result.

    Args:
        wl2_xml: string holding the WL2 XML source.
        slug: value written to both the ``redslug`` and ``wlslug``
            attributes of the root element before the transformation.

    Returns:
        The transformed document as produced by
        ``etree.tostring(..., encoding='utf-8')``.

    Raises:
        librarian.ParseError: when a document whose first ``dc:type`` is
            'course' or 'synthetic' has no lesson-flow section, when an
            ``aktywnosc/opis`` element has no children, or when its first
            child does not start with "<digits>. ".
    """
    from lxml import etree
    import re
    from StringIO import StringIO
    from urllib import unquote
    import os.path
    from django.conf import settings
    from fnpdjango.utils.text.slughifi import slughifi
    from librarian import ParseError, DCNS

    def _register_function(f):
        """Register *f* as an lxml extension function under its own name."""
        ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
        ns[f.__name__] = f
        return f

    @_register_function
    def slugify(context, text):
        """Return the slug of *text*; a list of strings is joined first."""
        if isinstance(text, list):
            text = ''.join(text)
        return slughifi(text)

    @_register_function
    def rmext(context, text):
        """URL-unquote *text* and drop a known document/image extension."""
        if isinstance(text, list):
            text = ''.join(text)
        text = unquote(text)
        if '.' in text:
            name, ext = text.rsplit('.', 1)
            if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
                text = name
        return text

    t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
    # NOTE(review): the search literal was shown as a plain space (a no-op
    # replacement); it is reconstructed here as the HTML entity, which XML
    # parsers reject as undefined -- confirm against the repository.
    ft = wl2_xml.replace('&nbsp;', ' ')
    f2 = StringIO(ft)
    i1 = etree.parse(f2)

    # './/' is what lxml turns a leading '//' into on a tree-level search;
    # spelling it out avoids the FutureWarning for absolute paths.
    for sect in i1.findall('.//section'):
        if len(sect) == 0:
            # A childless section cannot carry the lesson-flow heading.
            continue
        heading = sect[0].text
        if heading and heading.strip() == u'Przebieg zajęć':
            # Flatten: move every section nested below the first
            # subsection so that it becomes a direct child of `sect`.
            first = sect.find('section')
            if first is not None:
                for sub in first.findall('.//section'):
                    sect.append(sub)
            break
    else:
        # No lesson-flow section found: that is an error for these types.
        dc_type = i1.findall('.//dc:type', namespaces={'dc': DCNS.uri})
        # Compare the element's text, not the element object itself.
        if dc_type and (dc_type[0].text or '').strip() in ('course', 'synthetic'):
            raise ParseError('Brak przebiegu')

    root = i1.getroot()
    root.attrib['redslug'] = slug
    root.attrib['wlslug'] = slug
    w1t = i1.xslt(t)

    # Accept numbering of any length ("1. ", "12. ") instead of assuming a
    # single digit followed by exactly two more characters.
    numbering = re.compile(r'\d+\.\s')
    for h in w1t.findall('.//aktywnosc/opis'):
        # len() rather than truthiness: element truth testing is deprecated.
        if len(h) == 0:
            raise ParseError('Pusty element aktywnosc/opis')
        title = h[0].text
        match = numbering.match(title) if title else None
        if match is None:
            raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(title))
        h[0].text = title[match.end():]
    return etree.tostring(w1t, encoding='utf-8')