1 # -*- coding: utf-8 -*-
2 from copy import deepcopy
6 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
# Patterns matching the trim markers: an XML comment holding TRIM_BEGIN or
# TRIM_END that occupies a whole line (re.M makes ^ and $ match at every line
# boundary).  The marker text is escaped so that it is always matched
# literally, exactly as add_trim_begin()/add_trim_end() emit it through
# etree.Comment().
RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % re.escape(TRIM_BEGIN), re.M)
RE_TRIM_END = re.compile("^<!--%s-->$" % re.escape(TRIM_END), re.M)
class ParseError(BaseException):
    """Raised by this module when an XML document cannot be processed.

    For example, add_trim_begin() and add_trim_end() raise it when the
    document has no master tag.

    NOTE(review): the class derives from BaseException, so a plain
    ``except Exception`` handler will not catch it.  The base class is left
    unchanged so that existing callers keep seeing the same behaviour.
    """
def _trim(text, trim_begin=True, trim_end=True):
    """
        Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
        that eg. one big XML file can be compiled from many small XML files.

        With `trim_begin` set, everything up to and including the first
        RE_TRIM_BEGIN match is dropped.  With `trim_end` set, everything from
        the first RE_TRIM_END match (in what is left) onwards is dropped.
        A marker that does not occur leaves its side of the text untouched.

        Returns the trimmed text.
    """
    if trim_begin:
        # split() yields [before, after], or [text] when there is no marker;
        # the last item is therefore always the part to keep.
        text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1]
    if trim_end:
        # Symmetrically, the first item is the part preceding the marker.
        text = RE_TRIM_END.split(text, maxsplit=1)[0]
    return text
def compile_text(parts):
    """
        Compiles full text from an iterable of parts,
        trimming where applicable.

        Empty parts are skipped.  The first non-empty part keeps its
        beginning, the last non-empty part keeps its end, and every other
        boundary is cut at the trim markers (see _trim).

        Returns the concatenation of the trimmed parts; an empty string
        when there is no non-empty part.
    """
    texts = []
    trim_begin = False
    # `text' always holds the most recent non-empty part, which is only
    # emitted once we know whether another non-empty part follows it.
    text = ''
    for next_text in parts:
        if not next_text:
            continue
        if text:
            # trim the end, because there's more non-empty text
            # don't trim beginning, if `text' is the first non-empty part
            texts.append(_trim(text, trim_begin=trim_begin))
            trim_begin = True
        text = next_text
    # don't trim the end, because there's no more text coming after `text'
    # only trim beginning if it's not still the first non-empty
    texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
    return "".join(texts)
def add_trim_begin(text):
    """Insert a TRIM_BEGIN marker at the start of the master element.

    The master element is the last child of the document root whose tag is
    listed in MASTERS.  The marker comment becomes its first child and the
    text that used to open the master element is moved after the marker.

    Returns the serialized document as a unicode string.
    Raises ParseError when the root has no child with a MASTERS tag.
    """
    trim_tag = etree.Comment(TRIM_BEGIN)
    e = etree.fromstring(text)
    for master in e[::-1]:
        if master.tag in MASTERS:
            break
    else:
        # Also covers a root without any children, where `master' would
        # otherwise be unbound.
        raise ParseError('No master tag found!')

    master.insert(0, trim_tag)
    trim_tag.tail = '\n\n\n' + (master.text or '')
    # The old leading text now lives in the marker's tail; leave only a
    # newline so the marker starts its own line, as RE_TRIM_BEGIN requires.
    master.text = '\n'
    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
def add_trim_end(text):
    """Append a TRIM_END marker at the end of the master element.

    The master element is the last child of the document root whose tag is
    listed in MASTERS.  The marker comment becomes its last child, separated
    from the preceding content by blank lines.

    Returns the serialized document as a unicode string.
    Raises ParseError when the root has no child with a MASTERS tag.
    """
    trim_tag = etree.Comment(TRIM_END)
    e = etree.fromstring(text)
    for master in e[::-1]:
        if master.tag in MASTERS:
            break
    else:
        # Also covers a root without any children, where `master' would
        # otherwise be unbound.
        raise ParseError('No master tag found!')

    master.append(trim_tag)
    # The marker must end its line for RE_TRIM_END's `$' to match.
    trim_tag.tail = '\n'
    prev = trim_tag.getprevious()
    if prev is not None:
        # Content before the marker ends with an element: pad its tail.
        prev.tail = (prev.tail or '') + '\n\n\n'
    else:
        # The marker is the only child: pad the master's own text instead.
        master.text = (master.text or '') + '\n\n\n'
    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
# NOTE(review): the `def' line of this function was not present in the text
# under review; the name is inferred from the docstring and the parameter
# from its use below -- confirm both against the callers.
def split_xml(text):
    """Splits text into chapters.

    The document is cut at every `naglowek_rozdzial' (chapter header)
    element.  Working backwards from the last header, a copy of the tree is
    stripped of everything preceding that header, and the working tree is
    truncated right before it.

    Returns a list of [name, xml] two-item lists in document order.  The
    first item is named u'początek' and holds whatever precedes the first
    chapter header.  Every chunk but the first gets a TRIM_BEGIN marker and
    every chunk but the last gets a TRIM_END marker.

    All this stuff really must go somewhere else.

    """
    src = etree.fromstring(text)
    chunks = []

    splitter = u'naglowek_rozdzial'
    splitter_path = './/' + splitter
    parts = src.findall(splitter_path)
    while parts:
        # copy the document before it gets truncated below
        copied = deepcopy(src)

        element = parts[-1]

        # find the chapter's title
        name_elem = deepcopy(element)
        for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
            for a in name_elem.findall('.//' + tag):
                # NOTE(review): reconstructed -- empties the annotation
                # element so it adds nothing to the title; confirm.
                a.text = ''
                del a[:]
        name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()

        # in the original, remove everything from the start of the last chapter
        parent = element.getparent()
        del parent[parent.index(element):]
        element, parent = parent, parent.getparent()
        while parent is not None:
            del parent[parent.index(element) + 1:]
            element, parent = parent, parent.getparent()

        # in the copy, remove everything before the last chapter
        element = copied.findall(splitter_path)[-1]
        parent = element.getparent()
        while parent is not None:
            # NOTE(review): reconstructed -- leading text belongs to the
            # preceding chunk; confirm.
            parent.text = None
            while parent[0] is not element:
                del parent[0]
            element, parent = parent, parent.getparent()
        chunks[:0] = [[name, unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')]]

        parts = src.findall(splitter_path)

    chunks[:0] = [[u'początek', unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')]]

    for ch in chunks[1:]:
        ch[1] = add_trim_begin(ch[1])
    for ch in chunks[:-1]:
        ch[1] = add_trim_end(ch[1])

    return chunks
# Converts a WL2 XML document (`wl2_xml`) into WL1 XML, stamping `slug` on
# the root element, and returns the result serialized as UTF-8.
# NOTE(review): several statements of this function are not present in the
# text under review (e.g. the assignments of `i1` and `w1t`, the imports of
# `re` and `os`, the bodies of the `isinstance` branches).  The comments
# below describe only what is visible; the code itself is left untouched.
140 def wl2_to_wl1(wl2_xml, slug):
141 from lxml import etree
143 from StringIO import StringIO
144 from urllib import unquote
146 from django.conf import settings
147 from fnpdjango.utils.text.slughifi import slughifi
# This local import shadows the module-level ParseError: every ParseError
# raised inside this function is librarian.ParseError.
148 from librarian import ParseError, DCNS
150 def _register_function(f):
151 """ Register extension function with lxml """
# Extension functions live in the http://wolnelektury.pl/functions namespace.
152 ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
157 def slugify(context, text):
# NOTE(review): the docstring below does not match the visible code, which
# returns slughifi(text).
158 """Remove unneeded whitespace from beginning and end"""
159 if isinstance(text, list):
161 return slughifi(text)
164 def rmext(context, text):
165 if isinstance(text, list):
# Splits off the part after the last dot and checks it against a fixed list
# of document/image extensions (case-insensitively).
169 name, ext = text.rsplit('.', 1)
170 if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
# The stylesheet is loaded from xslt/wl2to1.xslt under settings.PROJECT_ROOT.
174 t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
# NOTE(review): as shown, both arguments look like a single space, which
# would make this a no-op; presumably the first is a non-breaking space or
# an entity that was lost -- confirm against the real file.
175 ft = wl2_xml.replace(' ', ' ')
# Looks for the section whose first child reads u'Przebieg zajęć' (Polish for
# "course of the lesson") and collects the sections nested under its first
# sub-section.
179 for sect in i1.findall('//section'):
180 if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
182 first = sect.find('section')
183 subs = first.findall('.//section')
188 dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
# NOTE(review): findall() returns elements, so `dc_type[0] in (...)` compares
# an element with strings and is presumably never true; `dc_type[0].text`
# was probably intended -- confirm.
189 if dc_type and dc_type[0] in ('course', 'synthetic'):
190 raise ParseError('Brak przebiegu')
# The same slug is written to both the `redslug` and `wlslug` root attributes.
192 i1.getroot().attrib['redslug'] = slug
193 i1.getroot().attrib['wlslug'] = slug # THIS!
# Every aktywnosc/opis must open with a child whose text starts with
# "<digit>. "; that three-character prefix is then cut off.
195 for h in w1t.findall('//aktywnosc/opis'):
197 raise ParseError('Pusty element aktywnosc/opis')
198 # FIXME assumption that every lesson has at most 9 parts
199 if not h[0].text or not re.match(r'\d\.\s', h[0].text):
200 raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(h[0].text))
201 h[0].text = h[0].text[3:]
202 return etree.tostring(w1t, encoding='utf-8')