1 # -*- coding: utf-8 -*-
2 from copy import deepcopy
5 from django.utils.encoding import force_str
7 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
# Multiline patterns matching a line that consists solely of an XML comment
# holding the trim marker. _trim() splits text on them.
# The marker text is interpolated into the pattern verbatim (not escaped).
RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % TRIM_BEGIN, re.M)
RE_TRIM_END = re.compile("^<!--%s-->$" % TRIM_END, re.M)
class ParseError(Exception):
    """Raised when a document lacks the structure this module requires.

    In this module it is raised when no master tag can be found among the
    children of a document's root element.

    It derives from ``Exception`` (not ``BaseException``) so that ordinary
    ``except Exception`` handlers see it as a regular error; code catching
    ``ParseError`` explicitly is unaffected.
    """
def _trim(text, trim_begin=True, trim_end=True):
    """Cut the text down to the part between the trim markers.

    Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
    that e.g. one big XML file can be compiled from many small XML files.

    ``trim_begin`` and ``trim_end`` switch the respective cut on or off.
    A text without the relevant marker is returned unchanged by that cut.
    Returns the trimmed text.
    """
    if trim_begin:
        # Without a match split() yields [text], so [-1] keeps it whole.
        text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1]
    if trim_end:
        # Likewise [0] is the whole text when there is no end marker.
        text = RE_TRIM_END.split(text, maxsplit=1)[0]
    return text
def compile_text(parts):
    """Compile the full text from an iterable of parts, trimming where applicable.

    Empty parts are skipped.  The first non-empty part keeps its beginning
    and the last non-empty part keeps its end; every other boundary is cut
    at the trim markers by ``_trim``.  Returns the concatenated text (an
    empty string when there is no non-empty part).
    """
    texts = []
    trim_begin = False
    text = ''
    for next_text in parts:
        if not next_text:
            continue
        if text:
            # trim the end, because there's more non-empty text
            # don't trim beginning, if `text' is the first non-empty part
            texts.append(_trim(text, trim_begin=trim_begin))
            trim_begin = True
        text = next_text
    # don't trim the end, because there's no more text coming after `text'
    # only trim beginning if it's not still the first non-empty
    texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
    return "".join(texts)
def add_trim_begin(text):
    """Insert a TRIM_BEGIN comment as the first node of the master element.

    ``text`` is parsed as XML.  The master element is the last child of the
    root whose tag is listed in MASTERS.  Returns the serialized document as
    a unicode string.

    Raises ParseError when the root has no such child (including when it has
    no children at all).
    """
    trim_tag = etree.Comment(TRIM_BEGIN)
    e = etree.fromstring(text)
    # Walk the root's children backwards; for/else also covers an empty root,
    # where `master` would otherwise never be bound.
    for master in e[::-1]:
        if master.tag in MASTERS:
            break
    else:
        raise ParseError('No master tag found!')

    master.insert(0, trim_tag)
    # The master's leading text now follows the marker instead of preceding it.
    trim_tag.tail = '\n\n\n' + (master.text or '')
    master.text = '\n'
    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
def add_trim_end(text):
    """Append a TRIM_END comment as the last node of the master element.

    ``text`` is parsed as XML.  The master element is the last child of the
    root whose tag is listed in MASTERS.  Returns the serialized document as
    a unicode string.

    Raises ParseError when the root has no such child (including when it has
    no children at all).
    """
    trim_tag = etree.Comment(TRIM_END)
    e = etree.fromstring(text)
    # Walk the root's children backwards; for/else also covers an empty root,
    # where `master` would otherwise never be bound.
    for master in e[::-1]:
        if master.tag in MASTERS:
            break
    else:
        raise ParseError('No master tag found!')

    master.append(trim_tag)
    trim_tag.tail = '\n'
    # Put blank lines right before the marker: after the preceding sibling
    # if there is one, otherwise after the master's own leading text.
    prev = trim_tag.getprevious()
    if prev is not None:
        prev.tail = (prev.tail or '') + '\n\n\n'
    else:
        master.text = (master.text or '') + '\n\n\n'
    return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
87 """Splits text into chapters.
89 All this stuff really must go somewhere else.
92 src = etree.fromstring(text)
95 splitter = u'naglowek_rozdzial'
96 parts = src.findall('.//naglowek_rozdzial')
99 copied = deepcopy(src)
103 # find the chapter's title
104 name_elem = deepcopy(element)
105 for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
106 for a in name_elem.findall('.//' + tag):
109 name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
111 # in the original, remove everything from the start of the last chapter
112 parent = element.getparent()
113 del parent[parent.index(element):]
114 element, parent = parent, parent.getparent()
115 while parent is not None:
116 del parent[parent.index(element) + 1:]
117 element, parent = parent, parent.getparent()
119 # in the copy, remove everything before the last chapter
120 element = copied.findall('.//naglowek_rozdzial')[-1]
121 parent = element.getparent()
122 while parent is not None:
124 while parent[0] is not element:
126 element, parent = parent, parent.getparent()
127 chunks[:0] = [[name, unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')]]
129 parts = src.findall('.//naglowek_rozdzial')
131 chunks[:0] = [[u'początek', unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')]]
133 for ch in chunks[1:]:
134 ch[1] = add_trim_begin(ch[1])
135 for ch in chunks[:-1]:
136 ch[1] = add_trim_end(ch[1])
def wl2_to_wl1(wl2_xml, slug):
    """Transform a document with the ``xslt/wl2to1.xslt`` stylesheet.

    ``wl2_xml`` is the source document as a string; ``slug`` is stored in the
    ``redslug`` and ``wlslug`` attributes of its root before transforming.
    Returns the transformed document serialized as UTF-8 bytes.

    Raises ``librarian.ParseError`` when a 'course' or 'synthetic' document
    has no 'Przebieg zajęć' section, when an ``aktywnosc/opis`` element is
    empty, or when its first child does not start with a "<number>. " prefix.
    """
    from lxml import etree
    import re
    from StringIO import StringIO
    from urllib import unquote
    import os.path
    from django.conf import settings
    from fnpdjango.utils.text.slughifi import slughifi
    from librarian import ParseError, DCNS

    def _register_function(f):
        """ Register extension function with lxml """
        ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
        ns[f.__name__] = f
        return f

    @_register_function
    def slugify(context, text):
        """Return the slug of text; a list of strings is concatenated first."""
        if isinstance(text, list):
            text = ''.join(text)
        return slughifi(text)

    @_register_function
    def rmext(context, text):
        """Unquote text and strip a known document/image file extension."""
        if isinstance(text, list):
            text = ''.join(text)
        text = unquote(text)
        if '.' in text:
            name, ext = text.rsplit('.', 1)
            if ext.lower() in ('doc', 'docx', 'odt', 'pdf', 'jpg', 'jpeg'):
                text = name
        return text

    t = etree.parse(os.path.join(settings.PROJECT_ROOT, 'xslt/wl2to1.xslt'))
    # NOTE(review): the listing showed a bare space here, making the call a
    # no-op; presumably the HTML entity, which XML parsers do not define.
    ft = wl2_xml.replace('&nbsp;', ' ')
    f2 = StringIO(ft)
    i1 = etree.parse(f2)

    for sect in i1.findall('//section'):
        if sect[0].text and sect[0].text.strip() == u'Przebieg zajęć':
            # Flatten: move the sections nested under the first subsection
            # directly under this section.
            first = sect.find('section')
            subs = first.findall('.//section')
            for sub in subs:
                sect.append(sub)
            break
    else:
        dc_type = i1.findall('//dc:type', namespaces={'dc': DCNS.uri})
        # Compare the element's text; an Element never equals a string, so
        # testing the element itself could not detect these types.
        if dc_type and dc_type[0].text in ('course', 'synthetic'):
            raise ParseError('Brak przebiegu')

    i1.getroot().attrib['redslug'] = slug
    i1.getroot().attrib['wlslug'] = slug  # THIS!
    w1t = i1.xslt(t)
    for h in w1t.findall('//aktywnosc/opis'):
        if len(h) == 0:
            raise ParseError('Pusty element aktywnosc/opis')
        heading = h[0].text
        # Accept any number of digits, so lessons with more than 9 parts work.
        match = re.match(r'\d+\.\s', heading) if heading else None
        if match is None:
            raise ParseError('Niepoprawny nagłówek (aktywnosc/opis): %s' % repr(heading))
        # Drop the "<number>. " prefix.
        h[0].text = heading[match.end():]
    return etree.tostring(w1t, encoding='utf-8')
209 ('div', 'table.cell'),
def remove_element(element):
    """Remove element from its parent, keeping the text that follows it.

    lxml drops an element's tail together with the element, so the tail is
    first moved to the previous sibling's tail or, when the element is the
    first child, to the parent's text.  ``element`` must have a parent.
    """
    parent = element.getparent()
    tail = element.tail
    if tail:
        prev = element.getprevious()
        if prev is not None:
            prev.tail = (prev.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(element)
def remove_empty_elements(xml):
    """Strip childless elements that hold no non-whitespace text.

    Elements whose (tag, class attribute) pair is listed in EXCEPTIONS are
    kept.  Removing an element can leave its parent empty, so the scan is
    repeated until a pass removes nothing.

    Returns the cleaned document as a unicode string, or None when nothing
    was removed or ``xml`` could not be parsed.
    """
    try:
        # NOTE(review): the listing showed a bare space as the first argument
        # (a no-op); presumably the HTML entity, which XML does not define.
        tree = etree.fromstring(force_str(xml.replace('&nbsp;', u'\xa0')))
    except SyntaxError:
        # lxml's XMLSyntaxError is a SyntaxError subclass.
        return None
    changed = False
    another_loop = True
    while another_loop:
        another_loop = False
        # findall() returns a list, so removing while iterating is safe.
        for element in tree.findall('.//*'):
            if (not element.text or not element.text.strip()) and len(element) == 0:
                if (element.tag, element.attrib.get('class')) not in EXCEPTIONS:
                    remove_element(element)
                    changed = True
                    another_loop = True
    return etree.tostring(tree, encoding=unicode) if changed else None