apps/catalogue/xml_tools.py

   1 # -*- coding: utf-8 -*-
   2 from copy import deepcopy
   3 import re
   4
   5 from lxml import etree
   6 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
   7
   8 RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % TRIM_BEGIN, re.M)
   9 RE_TRIM_END = re.compile("^<!--%s-->$" % TRIM_END, re.M)
  10
  11
  12 class ParseError(BaseException):
  13     pass
  14
  15
  16 def _trim(text, trim_begin=True, trim_end=True):
  17     """
  18         Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
  19         that eg. one big XML file can be compiled from many small XML files.
  20     """
  21     if trim_begin:
  22         text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1]
  23     if trim_end:
  24         text = RE_TRIM_END.split(text, maxsplit=1)[0]
  25     return text
  26
  27
  28 def compile_text(parts):
  29     """
  30         Compiles full text from an iterable of parts,
  31         trimming where applicable.
  32     """
  33     texts = []
  34     trim_begin = False
  35     text = ''
  36     for next_text in parts:
  37         if not next_text:
  38             continue
  39         if text:
  40             # trim the end, because there's more non-empty text
  41             # don't trim beginning, if `text' is the first non-empty part
  42             texts.append(_trim(text, trim_begin=trim_begin))
  43             trim_begin = True
  44         text = next_text
  45     # don't trim the end, because there's no more text coming after `text'
  46     # only trim beginning if it's not still the first non-empty
  47     texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
  48     return "".join(texts)
  49
  50
  51 def add_trim_begin(text):
  52     trim_tag = etree.Comment(TRIM_BEGIN)
  53     e = etree.fromstring(text)
  54     for master in e[::-1]:
  55         if master.tag in MASTERS:
  56             break
  57     if master.tag not in MASTERS:
  58         raise ParseError('No master tag found!')
  59
  60     master.insert(0, trim_tag)
  61     trim_tag.tail = '\n\n\n' + (master.text or '')
  62     master.text = '\n'
  63     return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  64
  65
  66 def add_trim_end(text):
  67     trim_tag = etree.Comment(TRIM_END)
  68     e = etree.fromstring(text)
  69     for master in e[::-1]:
  70         if master.tag in MASTERS:
  71             break
  72     if master.tag not in MASTERS:
  73         raise ParseError('No master tag found!')
  74
  75     master.append(trim_tag)
  76     trim_tag.tail = '\n'
  77     prev = trim_tag.getprevious()
  78     if prev is not None:
  79         prev.tail = (prev.tail or '') + '\n\n\n'
  80     else:
  81         master.text = (master.text or '') + '\n\n\n'
  82     return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
  83
  84
def split_xml(text):
    """Splits text into chapters.

    The document is cut at every `naglowek_rozdzial' element, working from
    the last one backwards.  Returns a list of [name, xml] pairs in document
    order: first the part preceding the first chapter heading (under a fixed
    label), then one pair per chapter heading, named with the heading's text
    content.  Every chunk but the first gets a trim-begin marker and every
    chunk but the last gets a trim-end marker.

    All this stuff really must go somewhere else.

    """
    src = etree.fromstring(text)
    chunks = []

    # NOTE(review): `splitter' is assigned but not used below; the tag name
    # is repeated literally in the findall() calls.
    splitter = u'naglowek_rozdzial'
    parts = src.findall('.//naglowek_rozdzial')
    # Each pass cuts the last remaining chapter off `src', so the loop ends
    # when `src' holds no chapter headings any more.
    while parts:
        # copy the document
        copied = deepcopy(src)

        element = parts[-1]

        # find the chapter's title: the heading's text content, with the
        # content of the listed descendant tags blanked out first
        name_elem = deepcopy(element)
        for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
            for a in name_elem.findall('.//' + tag):
                a.text = ''
                del a[:]
        # `name' is a UTF-8 encoded string with surrounding whitespace stripped
        name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()

        # in the original, remove everything from the start of the last chapter:
        # first the heading and its following siblings, then, walking up,
        # everything that follows each ancestor
        parent = element.getparent()
        del parent[parent.index(element):]
        element, parent = parent, parent.getparent()
        while parent is not None:
            del parent[parent.index(element) + 1:]
            element, parent = parent, parent.getparent()

        # in the copy, remove everything before the last chapter:
        # at each level, drop the leading text and all preceding siblings
        element = copied.findall('.//naglowek_rozdzial')[-1]
        parent = element.getparent()
        while parent is not None:
            parent.text = None
            while parent[0] is not element:
                del parent[0]
            element, parent = parent, parent.getparent()
        # prepend, because chapters are processed from last to first
        chunks[:0] = [[name, unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')]]

        parts = src.findall('.//naglowek_rozdzial')

    # whatever is left of `src' precedes the first chapter heading
    chunks[:0] = [[u'początek', unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')]]

    # mark where each chunk should be trimmed when the chunks are compiled
    # back together: the first keeps its beginning, the last keeps its end
    for ch in chunks[1:]:
        ch[1] = add_trim_begin(ch[1])
    for ch in chunks[:-1]:
        ch[1] = add_trim_end(ch[1])

    return chunks
 138
 139
 140 def wl2_to_wl1(wl2_xml, slug):
 141     from lxml import etree
 142     import re
 143     from StringIO import StringIO
 144     from urllib import unquote
 145     import os.path
 146     from django.conf import settings
 147     from fnpdjango.utils.text.slughifi import slughifi
 148     from librarian import ParseError, DCNS
 149
 150     def _register_function(f):
 151         """ Register extension function with lxml """
 152         ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
 153         ns[f.__name__] = f
 154         return f
 155
 156     @_register_function
 157     def slugify(context, text):
 158         """Remove unneeded whitespace from beginning and end"""
 159         if isinstance(text, list):
 160             text = ''.join(text)
 161         return slughifi(text)
 162
 163     @_register_function
 164     def rmext(context, text):
 165         if isinstance(text, list):
 166             text = ''.join(text)
 167         text = unquote(text)
 168         if '.' in text:
 169             name, ext = text.rsplit('.', 1)
 170             if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
 171                 text = name
 172         return text
 173
 174     t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
 175     ft = wl2_xml.replace('&nbsp;', ' ')
 176     f2 = StringIO(ft)
 177     i1 = etree.parse(f2)
 178
 179     for sect in i1.findall('//section'):
 180         if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
 181             # Prostujemy.
 182             first = sect.find('section')
 183             subs = first.findall('.//section')
 184             for sub in subs:
 185                 sect.append(sub)
 186             break
 187     else:
 188         dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
 189         if dc_type and dc_type[0] in ('course', 'synthetic'):
 190             raise ParseError('Brak przebiegu')
 191
 192     i1.getroot().attrib['redslug'] = slug
 193     i1.getroot().attrib['wlslug'] = slug  # THIS!
 194     w1t = i1.xslt(t)
 195     for h in w1t.findall('//aktywnosc/opis'):
 196         if len(h) == 0:
 197             raise ParseError('Pusty element aktywnosc/opis')
 198         # FIXME assumption that every lesson has at most 9 parts
 199         if not h[0].text or not re.match(r'\d\.\s', h[0].text):
 200             raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
 201         h[0].text = h[0].text[3:]
 202     return etree.tostring(w1t, encoding='utf-8')