apps/catalogue/xml_tools.py

   1 # -*- coding: utf-8 -*-
   2 from copy import deepcopy
   3 import re
   4
   5 from django.utils.encoding import force_str
   6 from lxml import etree
   7 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
   8
   9 RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % TRIM_BEGIN, re.M)
  10 RE_TRIM_END = re.compile("^<!--%s-->$" % TRIM_END, re.M)
  11
  12
  13 class ParseError(BaseException):
  14     pass
  15
  16
  17 def _trim(text, trim_begin=True, trim_end=True):
  18     """
  19         Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
  20         that eg. one big XML file can be compiled from many small XML files.
  21     """
  22     if trim_begin:
  23         text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1]
  24     if trim_end:
  25         text = RE_TRIM_END.split(text, maxsplit=1)[0]
  26     return text
  27
  28
  29 def compile_text(parts):
  30     """
  31         Compiles full text from an iterable of parts,
  32         trimming where applicable.
  33     """
  34     texts = []
  35     trim_begin = False
  36     text = ''
  37     for next_text in parts:
  38         if not next_text:
  39             continue
  40         if text:
  41             # trim the end, because there's more non-empty text
  42             # don't trim beginning, if `text' is the first non-empty part
  43             texts.append(_trim(text, trim_begin=trim_begin))
  44             trim_begin = True
  45         text = next_text
  46     # don't trim the end, because there's no more text coming after `text'
  47     # only trim beginning if it's not still the first non-empty
  48     texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
  49     return "".join(texts)
  50
  51
  52 def add_trim_begin(text):
  53     trim_tag = etree.Comment(TRIM_BEGIN)
  54     e = etree.fromstring(text)
  55     for master in e[::-1]:
  56         if master.tag in MASTERS:
  57             break
  58     if master.tag not in MASTERS:
  59         raise ParseError('No master tag found!')
  60
  61     master.insert(0, trim_tag)
  62     trim_tag.tail = '\n\n\n' + (master.text or '')
  63     master.text = '\n'
  64     return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  65
  66
  67 def add_trim_end(text):
  68     trim_tag = etree.Comment(TRIM_END)
  69     e = etree.fromstring(text)
  70     for master in e[::-1]:
  71         if master.tag in MASTERS:
  72             break
  73     if master.tag not in MASTERS:
  74         raise ParseError('No master tag found!')
  75
  76     master.append(trim_tag)
  77     trim_tag.tail = '\n'
  78     prev = trim_tag.getprevious()
  79     if prev is not None:
  80         prev.tail = (prev.tail or '') + '\n\n\n'
  81     else:
  82         master.text = (master.text or '') + '\n\n\n'
  83     return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  84
  85
  86 def split_xml(text):
  87     """Splits text into chapters.
  88
  89     All this stuff really must go somewhere else.
  90
  91     """
  92     src = etree.fromstring(text)
  93     chunks = []
  94
  95     splitter = u'naglowek_rozdzial'
  96     parts = src.findall('.//naglowek_rozdzial')
  97     while parts:
  98         # copy the document
  99         copied = deepcopy(src)
 100
 101         element = parts[-1]
 102
 103         # find the chapter's title
 104         name_elem = deepcopy(element)
 105         for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
 106             for a in name_elem.findall('.//' + tag):
 107                 a.text = ''
 108                 del a[:]
 109         name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
 110
 111         # in the original, remove everything from the start of the last chapter
 112         parent = element.getparent()
 113         del parent[parent.index(element):]
 114         element, parent = parent, parent.getparent()
 115         while parent is not None:
 116             del parent[parent.index(element) + 1:]
 117             element, parent = parent, parent.getparent()
 118
 119         # in the copy, remove everything before the last chapter
 120         element = copied.findall('.//naglowek_rozdzial')[-1]
 121         parent = element.getparent()
 122         while parent is not None:
 123             parent.text = None
 124             while parent[0] is not element:
 125                 del parent[0]
 126             element, parent = parent, parent.getparent()
 127         chunks[:0] = [[name, unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')]]
 128
 129         parts = src.findall('.//naglowek_rozdzial')
 130
 131     chunks[:0] = [[u'początek', unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')]]
 132
 133     for ch in chunks[1:]:
 134         ch[1] = add_trim_begin(ch[1])
 135     for ch in chunks[:-1]:
 136         ch[1] = add_trim_end(ch[1])
 137
 138     return chunks
 139
 140
 141 def wl2_to_wl1(wl2_xml, slug):
 142     from lxml import etree
 143     import re
 144     from StringIO import StringIO
 145     from urllib import unquote
 146     import os.path
 147     from django.conf import settings
 148     from fnpdjango.utils.text.slughifi import slughifi
 149     from librarian import ParseError, DCNS
 150
 151     def _register_function(f):
 152         """ Register extension function with lxml """
 153         ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
 154         ns[f.__name__] = f
 155         return f
 156
 157     @_register_function
 158     def slugify(context, text):
 159         """Remove unneeded whitespace from beginning and end"""
 160         if isinstance(text, list):
 161             text = ''.join(text)
 162         return slughifi(text)
 163
 164     @_register_function
 165     def rmext(context, text):
 166         if isinstance(text, list):
 167             text = ''.join(text)
 168         text = unquote(text)
 169         if '.' in text:
 170             name, ext = text.rsplit('.', 1)
 171             if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
 172                 text = name
 173         return text
 174
 175     t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
 176     ft = wl2_xml.replace('&nbsp;', ' ')
 177     f2 = StringIO(ft)
 178     i1 = etree.parse(f2)
 179
 180     for sect in i1.findall('//section'):
 181         if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
 182             # Prostujemy.
 183             first = sect.find('section')
 184             subs = first.findall('.//section')
 185             for sub in subs:
 186                 sect.append(sub)
 187             break
 188     else:
 189         dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
 190         if dc_type and dc_type[0] in ('course', 'synthetic'):
 191             raise ParseError('Brak przebiegu')
 192
 193     i1.getroot().attrib['redslug'] = slug
 194     i1.getroot().attrib['wlslug'] = slug  # THIS!
 195     w1t = i1.xslt(t)
 196     for h in w1t.findall('//aktywnosc/opis'):
 197         if len(h) == 0:
 198             raise ParseError('Pusty element aktywnosc/opis')
 199         # FIXME assumption that every lesson has at most 9 parts
 200         if not h[0].text or not re.match(r'\d\.\s', h[0].text):
 201             raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
 202         h[0].text = h[0].text[3:]
 203     return etree.tostring(w1t, encoding='utf-8')
 204
 205
 206 EXCEPTIONS = [
 207     ('div', 'img'),
 208     ('div', 'video'),
 209     ('div', 'table.cell'),
 210     ('span', 'link'),
 211 ]
 212
 213
 214 def remove_element(element):
 215     parent = element.getparent()
 216     tail = element.tail
 217     if tail:
 218         prev = element.getprevious()
 219         if prev is not None:
 220             prev.tail = (prev.tail or '') + tail
 221         else:
 222             parent.text = (parent.text or '') + tail
 223     parent.remove(element)
 224
 225
 226 def remove_empty_elements(xml):
 227     try:
 228         tree = etree.fromstring(force_str(xml.replace('&nbsp;', u'\xa0')))
 229     except SyntaxError:
 230         return None
 231     changed = False
 232     another_loop = True
 233     while another_loop:
 234         another_loop = False
 235         for element in tree.findall('.//*'):
 236             if (not element.text or not element.text.strip()) and len(element) == 0:
 237                 if (element.tag, element.attrib.get('class')) not in EXCEPTIONS:
 238                     remove_element(element)
 239                     changed = True
 240                     another_loop = True
 241     return etree.tostring(tree, encoding=unicode) if changed else None