From 40a993399c67ca16752f44b8c60c41b8e8df838f Mon Sep 17 00:00:00 2001
From: Jan Szejko
Date: Thu, 28 Sep 2017 18:54:12 +0200
Subject: [PATCH] add command to convert wl2 to wl1

---
Review notes (free text between the "---" line and the first "diff --git"
line; it is not part of the commit message):

 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch.  Line counts match the hunk headers and the diffstat,
   but the leading whitespace of context and removed lines (for example the
   `post_publish.send(sender=br)` and `ch[1] = add_trim_end(ch[1])` context
   lines) is a best guess -- check it against the tree before applying.
 * The From: header carries no e-mail address; restore it if the patch is to
   be applied with `git am`.
 * wl2_to_wl1(): `dc_type[0] in ('course', 'synthetic')` compares the element
   returned by findall() with strings, so the 'Brak przebiegu' error looks
   unreachable; `dc_type[0].text` was probably intended.  The behaviour is
   carried over unchanged from Book.wl1_xml().
 * wl2_to_wl1(): `wl2_xml.replace(' ', ' ')` is a no-op as shown here; the
   first argument was presumably a non-breaking space -- verify against the
   original commit.
 * wl2_to_wl1(): `sect[0]` and `first.findall(...)` are unguarded; a section
   without children, or a matching section without a nested section, would
   presumably raise IndexError / AttributeError rather than ParseError.
 * Command.handle(): the file object returned by open(filename) is never
   explicitly closed.

 .../management/commands/wl2_to_wl1.py         | 15 ++++
 apps/catalogue/models/book.py                 | 68 +------------------
 apps/catalogue/xml_tools.py                   | 65 ++++++++++++++++++
 3 files changed, 82 insertions(+), 66 deletions(-)
 create mode 100644 apps/catalogue/management/commands/wl2_to_wl1.py

diff --git a/apps/catalogue/management/commands/wl2_to_wl1.py b/apps/catalogue/management/commands/wl2_to_wl1.py
new file mode 100644
index 00000000..eb2e2518
--- /dev/null
+++ b/apps/catalogue/management/commands/wl2_to_wl1.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from os.path import basename
+
+from django.core.management.base import BaseCommand
+
+from catalogue.xml_tools import wl2_to_wl1
+
+
+class Command(BaseCommand):
+    help = 'Converts a lesson XML from WL2 to WL1'
+    args = 'filename'
+
+    def handle(self, filename, *args, **options):
+        slug = basename(filename).split('.')[0]
+        print wl2_to_wl1(open(filename).read(), slug)
diff --git a/apps/catalogue/models/book.py b/apps/catalogue/models/book.py
index 45726551..7589587d 100755
--- a/apps/catalogue/models/book.py
+++ b/apps/catalogue/models/book.py
@@ -14,7 +14,7 @@ from catalogue.helpers import cached_in_field, GalleryMerger
 from catalogue.models import BookPublishRecord, ChunkPublishRecord, Project
 from catalogue.signals import post_publish
 from catalogue.tasks import refresh_instance, book_content_updated
-from catalogue.xml_tools import compile_text, split_xml
+from catalogue.xml_tools import compile_text, split_xml, wl2_to_wl1
 from cover.models import Image
 
 
@@ -435,68 +435,4 @@ class Book(models.Model):
         post_publish.send(sender=br)
 
     def wl1_xml(self, publishable=True, changes=None):
-        from lxml import etree
-        import re
-        from StringIO import StringIO
-        from urllib import unquote
-        import os.path
-        from django.conf import settings
-        from fnpdjango.utils.text.slughifi import slughifi
-        from librarian import ParseError, DCNS
-
-        def _register_function(f):
-            """ Register extension function with lxml """
-            ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
-            ns[f.__name__] = f
-            return f
-
-        @_register_function
-        def slugify(context, text):
-            """Remove unneeded whitespace from beginning and end"""
-            if isinstance(text, list):
-                text = ''.join(text)
-            return slughifi(text)
-
-        @_register_function
-        def rmext(context, text):
-            if isinstance(text, list):
-                text = ''.join(text)
-            text = unquote(text)
-            if '.' in text:
-                name, ext = text.rsplit('.', 1)
-                if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
-                    text = name
-            return text
-
-        t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
-        ft = self.materialize(publishable=publishable, changes=changes)
-        ft = ft.replace(' ', ' ')
-        f2 = StringIO(ft)
-        i1 = etree.parse(f2)
-
-        for sect in i1.findall('//section'):
-            if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
-                # Prostujemy.
-                first = sect.find('section')
-                subs = first.findall('.//section')
-                for sub in subs:
-                    sect.append(sub)
-                break
-        else:
-            # print 'BRAK PRZEBIEGU'
-            dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
-            if dc_type and dc_type[0] in ('course', 'synthetic'):
-                raise ParseError('Brak przebiegu')
-
-        i1.getroot().attrib['redslug'] = self.slug
-        i1.getroot().attrib['wlslug'] = self.slug  # THIS!
-        # print '.',
-        w1t = i1.xslt(t)
-        for h in w1t.findall('//aktywnosc/opis'):
-            if len(h) == 0:
-                raise ParseError('Pusty element aktywnosc/opis')
-            # FIXME assumption that every lesson has at most 9 parts
-            if not h[0].text or not re.match(r'\d\.\s', h[0].text):
-                raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
-            h[0].text = h[0].text[3:]
-        return etree.tostring(w1t, encoding='utf-8')
+        return wl2_to_wl1(self.materialize(publishable=publishable, changes=changes), self.slug)
diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py
index 75cb2d8c..7be05fd5 100644
--- a/apps/catalogue/xml_tools.py
+++ b/apps/catalogue/xml_tools.py
@@ -135,3 +135,68 @@ def split_xml(text):
         ch[1] = add_trim_end(ch[1])
 
     return chunks
+
+
+def wl2_to_wl1(wl2_xml, slug):
+    from lxml import etree
+    import re
+    from StringIO import StringIO
+    from urllib import unquote
+    import os.path
+    from django.conf import settings
+    from fnpdjango.utils.text.slughifi import slughifi
+    from librarian import ParseError, DCNS
+
+    def _register_function(f):
+        """ Register extension function with lxml """
+        ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
+        ns[f.__name__] = f
+        return f
+
+    @_register_function
+    def slugify(context, text):
+        """Remove unneeded whitespace from beginning and end"""
+        if isinstance(text, list):
+            text = ''.join(text)
+        return slughifi(text)
+
+    @_register_function
+    def rmext(context, text):
+        if isinstance(text, list):
+            text = ''.join(text)
+        text = unquote(text)
+        if '.' in text:
+            name, ext = text.rsplit('.', 1)
+            if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
+                text = name
+        return text
+
+    t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
+    ft = wl2_xml.replace(' ', ' ')
+    f2 = StringIO(ft)
+    i1 = etree.parse(f2)
+
+    for sect in i1.findall('//section'):
+        if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
+            # Prostujemy.
+            first = sect.find('section')
+            subs = first.findall('.//section')
+            for sub in subs:
+                sect.append(sub)
+            break
+    else:
+        dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
+        if dc_type and dc_type[0] in ('course', 'synthetic'):
+            raise ParseError('Brak przebiegu')
+
+    i1.getroot().attrib['redslug'] = slug
+    i1.getroot().attrib['wlslug'] = slug  # THIS!
+    w1t = i1.xslt(t)
+    for h in w1t.findall('//aktywnosc/opis'):
+        if len(h) == 0:
+            raise ParseError('Pusty element aktywnosc/opis')
+        # FIXME assumption that every lesson has at most 9 parts
+        if not h[0].text or not re.match(r'\d\.\s', h[0].text):
+            raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
+        h[0].text = h[0].text[3:]
+    return etree.tostring(w1t, encoding='utf-8')
\ No newline at end of file
-- 
2.20.1