X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/2f9c60b76f3ab4e69d794a6bb14388a81ff29eb7..a802a933f53625f732e9d3807b4162e770c8aaf1:/apps/catalogue/xml_tools.py?ds=inline
diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py
old mode 100755
new mode 100644
index 928e57be..abab7f27
--- a/apps/catalogue/xml_tools.py
+++ b/apps/catalogue/xml_tools.py
@@ -1,6 +1,8 @@
-from functools import wraps
+# -*- coding: utf-8 -*-
+from copy import deepcopy
import re
+from django.utils.encoding import force_str
from lxml import etree
from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
@@ -12,112 +14,8 @@ class ParseError(BaseException):
pass
-def obj_memoized(f):
- """
- A decorator that caches return value of object methods.
- The cache is kept with the object, in a _obj_memoized property.
- """
- @wraps(f)
- def wrapper(self, *args, **kwargs):
- if not hasattr(self, '_obj_memoized'):
- self._obj_memoized = {}
- key = (f.__name__,) + args + tuple(sorted(kwargs.iteritems()))
- try:
- return self._obj_memoized[key]
- except TypeError:
- return f(self, *args, **kwargs)
- except KeyError:
- self._obj_memoized[key] = f(self, *args, **kwargs)
- return self._obj_memoized[key]
- return wrapper
-
-
-class GradedText(object):
- _edoc = None
-
- ROOT = 'utwor'
- RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'
-
- def __init__(self, text):
- self._text = text
-
- @obj_memoized
- def is_xml(self):
- """
- Determines if it's a well-formed XML.
-
- >>> GradedText("").is_xml()
- True
- >>> GradedText("").is_xml()
- False
- """
- try:
- self._edoc = etree.fromstring(self._text)
- except etree.XMLSyntaxError:
- return False
- return True
-
- @obj_memoized
- def is_wl(self):
- """
- Determines if it's an XML with a <utwor> and a master tag.
-
- >>> GradedText("").is_wl()
- True
- >>> GradedText("").is_wl()
- False
- """
- if self.is_xml():
- e = self._edoc
- # FIXME: there could be comments
- ret = e.tag == self.ROOT and (
- len(e) == 1 and e[0].tag in MASTERS or
- len(e) == 2 and e[0].tag == self.RDF
- and e[1].tag in MASTERS)
- if ret:
- self._master = e[-1].tag
- del self._edoc
- return ret
- else:
- return False
-
- @obj_memoized
- def is_broken_wl(self):
- """
- Determines if it at least looks like broken WL file
- and not just some untagged text.
-
- >>> GradedText("<").is_broken_wl()
- True
- >>> GradedText("some text").is_broken_wl()
- False
- """
- if self.is_wl():
- return True
- text = self._text.strip()
- return text.startswith('<utwor>') and text.endswith('</utwor>')
-
- def master(self):
- """
- Gets the master tag.
-
- >>> GradedText("").master()
- 'powiesc'
- """
- assert self.is_wl()
- return self._master
-
- @obj_memoized
- def has_trim_begin(self):
- return RE_TRIM_BEGIN.search(self._text)
-
- @obj_memoized
- def has_trim_end(self):
- return RE_TRIM_END.search(self._text)
-
-
def _trim(text, trim_begin=True, trim_end=True):
- """
+ """
Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
that eg. one big XML file can be compiled from many small XML files.
"""
@@ -129,7 +27,7 @@ def _trim(text, trim_begin=True, trim_end=True):
def compile_text(parts):
- """
+ """
Compiles full text from an iterable of parts,
trimming where applicable.
"""
@@ -139,10 +37,11 @@ def compile_text(parts):
for next_text in parts:
if not next_text:
continue
- # trim the end, because there's more non-empty text
- # don't trim beginning, if `text' is the first non-empty part
- texts.append(_trim(text, trim_begin=trim_begin))
- trim_begin = True
+ if text:
+ # trim the end, because there's more non-empty text
+ # don't trim beginning, if `text' is the first non-empty part
+ texts.append(_trim(text, trim_begin=trim_begin))
+ trim_begin = True
text = next_text
# don't trim the end, because there's no more text coming after `text'
# only trim beginning if it's not still the first non-empty
@@ -150,26 +49,6 @@ def compile_text(parts):
return "".join(texts)
-def change_master(text, master):
- """
- Changes the master tag in a WL document.
- """
- e = etree.fromstring(text)
- e[-1].tag = master
- return etree.tostring(e, encoding="utf-8")
-
-
-def basic_structure(text, master):
- e = etree.fromstring('''
-
-
-
-''' % (TRIM_BEGIN, TRIM_END))
- e[0].tag = master
- e[0][0].tail = "\n"*3 + text + "\n"*3
- return etree.tostring(e, encoding="utf-8")
-
-
def add_trim_begin(text):
trim_tag = etree.Comment(TRIM_BEGIN)
e = etree.fromstring(text)
@@ -182,7 +61,7 @@ def add_trim_begin(text):
master.insert(0, trim_tag)
trim_tag.tail = '\n\n\n' + (master.text or '')
master.text = '\n'
- return etree.tostring(e, encoding="utf-8")
+ return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
def add_trim_end(text):
@@ -201,4 +80,161 @@ def add_trim_end(text):
prev.tail = (prev.tail or '') + '\n\n\n'
else:
master.text = (master.text or '') + '\n\n\n'
- return etree.tostring(e, encoding="utf-8")
+ return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
+
+
+def split_xml(text):
+ """Splits text into chapters.
+
+ All this stuff really must go somewhere else.
+
+ """
+ src = etree.fromstring(text)
+ chunks = []
+
+ splitter = u'naglowek_rozdzial'
+ parts = src.findall('.//naglowek_rozdzial')
+ while parts:
+ # copy the document
+ copied = deepcopy(src)
+
+ element = parts[-1]
+
+ # find the chapter's title
+ name_elem = deepcopy(element)
+ for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
+ for a in name_elem.findall('.//' + tag):
+ a.text = ''
+ del a[:]
+ name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
+
+ # in the original, remove everything from the start of the last chapter
+ parent = element.getparent()
+ del parent[parent.index(element):]
+ element, parent = parent, parent.getparent()
+ while parent is not None:
+ del parent[parent.index(element) + 1:]
+ element, parent = parent, parent.getparent()
+
+ # in the copy, remove everything before the last chapter
+ element = copied.findall('.//naglowek_rozdzial')[-1]
+ parent = element.getparent()
+ while parent is not None:
+ parent.text = None
+ while parent[0] is not element:
+ del parent[0]
+ element, parent = parent, parent.getparent()
+ chunks[:0] = [[name, unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')]]
+
+ parts = src.findall('.//naglowek_rozdzial')
+
+ chunks[:0] = [[u'początek', unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')]]
+
+ for ch in chunks[1:]:
+ ch[1] = add_trim_begin(ch[1])
+ for ch in chunks[:-1]:
+ ch[1] = add_trim_end(ch[1])
+
+ return chunks
+
+
+def wl2_to_wl1(wl2_xml, slug):
+ from lxml import etree
+ import re
+ from StringIO import StringIO
+ from urllib import unquote
+ import os.path
+ from django.conf import settings
+ from fnpdjango.utils.text.slughifi import slughifi
+ from librarian import ParseError, DCNS
+
+ def _register_function(f):
+ """ Register extension function with lxml """
+ ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
+ ns[f.__name__] = f
+ return f
+
+ @_register_function
+ def slugify(context, text):
+ """Remove unneeded whitespace from beginning and end"""
+ if isinstance(text, list):
+ text = ''.join(text)
+ return slughifi(text)
+
+ @_register_function
+ def rmext(context, text):
+ if isinstance(text, list):
+ text = ''.join(text)
+ text = unquote(text)
+ if '.' in text:
+ name, ext = text.rsplit('.', 1)
+ if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
+ text = name
+ return text
+
+ t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
+ ft = wl2_xml.replace('&nbsp;', ' ')
+ f2 = StringIO(ft)
+ i1 = etree.parse(f2)
+
+ for sect in i1.findall('//section'):
+ if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
+ # Prostujemy.
+ first = sect.find('section')
+ subs = first.findall('.//section')
+ for sub in subs:
+ sect.append(sub)
+ break
+ else:
+ dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
+ if dc_type and dc_type[0] in ('course', 'synthetic'):
+ raise ParseError('Brak przebiegu')
+
+ i1.getroot().attrib['redslug'] = slug
+ i1.getroot().attrib['wlslug'] = slug # THIS!
+ w1t = i1.xslt(t)
+ for h in w1t.findall('//aktywnosc/opis'):
+ if len(h) == 0:
+ raise ParseError('Pusty element aktywnosc/opis')
+ # FIXME assumption that every lesson has at most 9 parts
+ if not h[0].text or not re.match(r'\d\.\s', h[0].text):
+ raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
+ h[0].text = h[0].text[3:]
+ return etree.tostring(w1t, encoding='utf-8')
+
+
+EXCEPTIONS = [
+ ('div', 'img'),
+ ('div', 'video'),
+ ('div', 'table.cell'),
+]
+
+
+def remove_element(element):
+ parent = element.getparent()
+ tail = element.tail
+ if tail:
+ prev = element.getprevious()
+ if prev is not None:
+ prev.tail = (prev.tail or '') + tail
+ else:
+ parent.text = (parent.text or '') + tail
+ parent.remove(element)
+
+
+def remove_empty_elements(xml):
+ try:
+ tree = etree.fromstring(force_str(xml.replace('&nbsp;', u'\xa0')))
+ except SyntaxError:
+ return None
+ changed = False
+ another_loop = True
+ while another_loop:
+ another_loop = False
+ for element in tree.findall('.//*'):
+ if (not element.text or not element.text.strip()) and len(element) == 0:
+ if (element.tag, element.attrib.get('class')) not in EXCEPTIONS:
+ remove_element(element)
+ changed = True
+ another_loop = True
+ return etree.tostring(tree, encoding=unicode) if changed else None