1 # -*- coding: utf-8 -*-
2 from copy import deepcopy
3 from functools import wraps
7 from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS
# Patterns matching a line that consists solely of a trim-marker comment
# (re.M makes ^ and $ anchor at line boundaries).
# The marker constants are also written into documents verbatim as XML
# comment text, so they are escaped here to be matched literally instead of
# being interpreted as regular-expression syntax.
RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % re.escape(TRIM_BEGIN), re.M)
RE_TRIM_END = re.compile("^<!--%s-->$" % re.escape(TRIM_END), re.M)
class ParseError(Exception):
    """Raised when a document cannot be processed, e.g. no master tag is found.

    Derives from Exception (not BaseException) so that generic
    ``except Exception`` handlers treat it like any other application error;
    BaseException is meant for interpreter-level signals such as
    KeyboardInterrupt and SystemExit.
    """
19 A decorator that caches return value of object methods.
20 The cache is kept with the object, in a _obj_memoized property.
23 def wrapper(self, *args, **kwargs):
24 if not hasattr(self, '_obj_memoized'):
25 self._obj_memoized = {}
26 key = (f.__name__,) + args + tuple(sorted(kwargs.iteritems()))
28 return self._obj_memoized[key]
30 return f(self, *args, **kwargs)
32 self._obj_memoized[key] = f(self, *args, **kwargs)
33 return self._obj_memoized[key]
37 class GradedText(object):
41 RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'
43 def __init__(self, text):
49 Determines if it's a well-formed XML.
51 >>> GradedText("<a/>").is_xml()
53 >>> GradedText("<a>").is_xml()
57 self._edoc = etree.fromstring(self._text)
58 except etree.XMLSyntaxError:
65 Determines if it's an XML with a <utwor> and a master tag.
67 >>> GradedText("<utwor><powiesc></powiesc></utwor>").is_wl()
69 >>> GradedText("<a></a>").is_wl()
74 # FIXME: there could be comments
75 ret = e.tag == self.ROOT and (
76 len(e) == 1 and e[0].tag in MASTERS or
77 len(e) == 2 and e[0].tag == self.RDF
78 and e[1].tag in MASTERS)
80 self._master = e[-1].tag
87 def is_broken_wl(self):
89 Determines if it at least looks like broken WL file
90 and not just some untagged text.
92 >>> GradedText("<utwor><</utwor>").is_broken_wl()
94 >>> GradedText("some text").is_broken_wl()
99 text = self._text.strip()
100 return text.startswith('<utwor>') and text.endswith('</utwor>')
106 >>> GradedText("<utwor><powiesc></powiesc></utwor>").master()
def has_trim_begin(self):
    """Look for a line consisting solely of the trim-begin marker comment.

    Returns the regex match object for the first such line in the text,
    or None when there is none, so the result can be used as a truth value.
    """
    marker_match = RE_TRIM_BEGIN.search(self._text)
    return marker_match
def has_trim_end(self):
    """Look for a line consisting solely of the trim-end marker comment.

    Returns the regex match object for the first such line in the text,
    or None when there is none, so the result can be used as a truth value.
    """
    marker_match = RE_TRIM_END.search(self._text)
    return marker_match
def _trim(text, trim_begin=True, trim_end=True):
    """
    Strip the parts of `text` lying outside the trim markers.

    With `trim_begin` set, everything up to and including the first
    RE_TRIM_BEGIN marker line is dropped; with `trim_end` set, everything
    from the first RE_TRIM_END marker line onwards is dropped.  A marker
    that does not occur leaves that side of the text untouched.  This is
    what lets one big XML file be compiled from many small XML files.
    """
    if trim_begin:
        # split() yields [text] when the marker is absent, so taking the
        # last piece is safe either way.
        pieces = RE_TRIM_BEGIN.split(text, maxsplit=1)
        text = pieces[-1]
    if trim_end:
        # Likewise, the first piece is the whole text when nothing matched.
        pieces = RE_TRIM_END.split(text, maxsplit=1)
        text = pieces[0]
    return text
133 def compile_text(parts):
135 Compiles full text from an iterable of parts,
136 trimming where applicable.
141 for next_text in parts:
145 # trim the end, because there's more non-empty text
146 # don't trim beginning, if `text' is the first non-empty part
147 texts.append(_trim(text, trim_begin=trim_begin))
150 # don't trim the end, because there's no more text coming after `text'
151 # only trim beginning if it's not still the first non-empty
152 texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
153 return "".join(texts)
156 def change_master(text, master):
158 Changes the master tag in a WL document.
160 e = etree.fromstring(text)
162 return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
165 def basic_structure(text, master):
166 e = etree.fromstring('''<utwor>
170 </utwor>''' % (TRIM_BEGIN, TRIM_END))
172 e[0][0].tail = "\n"*3 + text + "\n"*3
173 return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
176 def add_trim_begin(text):
177 trim_tag = etree.Comment(TRIM_BEGIN)
178 e = etree.fromstring(text)
179 for master in e[::-1]:
180 if master.tag in MASTERS:
182 if master.tag not in MASTERS:
183 raise ParseError('No master tag found!')
185 master.insert(0, trim_tag)
186 trim_tag.tail = '\n\n\n' + (master.text or '')
188 return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
191 def add_trim_end(text):
192 trim_tag = etree.Comment(TRIM_END)
193 e = etree.fromstring(text)
194 for master in e[::-1]:
195 if master.tag in MASTERS:
197 if master.tag not in MASTERS:
198 raise ParseError('No master tag found!')
200 master.append(trim_tag)
202 prev = trim_tag.getprevious()
204 prev.tail = (prev.tail or '') + '\n\n\n'
206 master.text = (master.text or '') + '\n\n\n'
207 return unicode(etree.tostring(e, encoding="utf-8"), 'utf-8')
211 """Splits text into chapters.
213 All this stuff really must go somewhere else.
216 src = etree.fromstring(text)
219 splitter = u'naglowek_rozdzial'
220 parts = src.findall('.//naglowek_rozdzial')
223 copied = deepcopy(src)
227 # find the chapter's title
228 name_elem = deepcopy(element)
229 for tag in 'extra', 'motyw', 'pa', 'pe', 'pr', 'pt', 'uwaga':
230 for a in name_elem.findall('.//' + tag):
233 name = etree.tostring(name_elem, method='text', encoding='utf-8').strip()
235 # in the original, remove everything from the start of the last chapter
236 parent = element.getparent()
237 del parent[parent.index(element):]
238 element, parent = parent, parent.getparent()
239 while parent is not None:
240 del parent[parent.index(element) + 1:]
241 element, parent = parent, parent.getparent()
243 # in the copy, remove everything before the last chapter
244 element = copied.findall('.//naglowek_rozdzial')[-1]
245 parent = element.getparent()
246 while parent is not None:
248 while parent[0] is not element:
250 element, parent = parent, parent.getparent()
252 unicode(etree.tostring(copied, encoding='utf-8'), 'utf-8')
255 parts = src.findall('.//naglowek_rozdzial')
257 chunks[:0] = [[u'poczÄ…tek',
258 unicode(etree.tostring(src, encoding='utf-8'), 'utf-8')
261 for ch in chunks[1:]:
262 ch[1] = add_trim_begin(ch[1])
263 for ch in chunks[:-1]:
264 ch[1] = add_trim_end(ch[1])