"""Helpers for grading, trimming and restructuring WL XML documents."""
from functools import wraps
import re

from lxml import etree

from wiki.constants import TRIM_BEGIN, TRIM_END, MASTERS

# Trim markers are stored in documents as XML comments (see add_trim_begin /
# add_trim_end, which insert them with etree.Comment).  A marker only counts
# when it occupies a whole line, hence the anchors combined with re.M.
RE_TRIM_BEGIN = re.compile("^<!--%s-->$" % re.escape(TRIM_BEGIN), re.M)
RE_TRIM_END = re.compile("^<!--%s-->$" % re.escape(TRIM_END), re.M)


class ParseError(Exception):
    """Raised when a document lacks the structure an operation needs.

    Derives from Exception (not BaseException) so that ordinary
    ``except Exception`` handlers see it.
    """


def obj_memoized(f):
    """
    A decorator that caches return value of object methods.
    The cache is kept with the object, in a _obj_memoized property.

    Calls whose arguments are unhashable are executed without caching.
    """
    @wraps(f)
    def wrapper(self, *args, **kwargs):
        if not hasattr(self, '_obj_memoized'):
            self._obj_memoized = {}
        cache = self._obj_memoized
        # .items() instead of .iteritems(): works on Python 2 and 3 alike.
        key = (f.__name__,) + args + tuple(sorted(kwargs.items()))
        try:
            return cache[key]
        except TypeError:
            # Unhashable argument: the key cannot be stored, so just call.
            return f(self, *args, **kwargs)
        except KeyError:
            cache[key] = f(self, *args, **kwargs)
            return cache[key]
    return wrapper


class GradedText(object):
    """Wraps a text and answers questions about how WL-like it is."""

    # Parsed document; set by is_xml(), dropped again by a successful is_wl().
    _edoc = None

    ROOT = 'utwor'
    RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'

    def __init__(self, text):
        self._text = text

    @obj_memoized
    def is_xml(self):
        """
        Determines if it's a well-formed XML.

        >>> GradedText("<a/>").is_xml()
        True
        >>> GradedText("<a>").is_xml()
        False
        """
        try:
            self._edoc = etree.fromstring(self._text)
        except etree.XMLSyntaxError:
            return False
        return True

    @obj_memoized
    def is_wl(self):
        """
        Determines if it's an XML with a <utwor> and a master tag.

        On success the master tag name is remembered for master().

        >>> GradedText("<utwor><powiesc></powiesc></utwor>").is_wl()
        True
        >>> GradedText("<a></a>").is_wl()
        False
        """
        if not self.is_xml():
            return False
        e = self._edoc
        # FIXME: there could be comments among the root's children; the
        # child-count checks below do not account for them.
        ret = e.tag == self.ROOT and (
            len(e) == 1 and e[0].tag in MASTERS or
            len(e) == 2 and e[0].tag == self.RDF
            and e[1].tag in MASTERS)
        if ret:
            self._master = e[-1].tag
            # The parsed tree is not needed once the master tag is known.
            del self._edoc
        return ret

    @obj_memoized
    def is_broken_wl(self):
        """
        Determines if it at least looks like broken WL file
        and not just some untagged text.

        >>> GradedText("<utwor><</utwor>").is_broken_wl()
        True
        >>> GradedText("some text").is_broken_wl()
        False
        """
        if self.is_wl():
            return True
        text = self._text.strip()
        return (text.startswith('<' + self.ROOT) and
                text.endswith('</%s>' % self.ROOT))

    def master(self):
        """
        Gets the master tag.

        Raises AssertionError if the text is not a WL document.

        >>> GradedText("<utwor><powiesc></powiesc></utwor>").master()
        'powiesc'
        """
        # Explicit raise rather than `assert`: is_wl() is what sets _master,
        # so the call must not disappear when Python runs with -O.
        if not self.is_wl():
            raise AssertionError('Not a WL document.')
        return self._master

    @obj_memoized
    def has_trim_begin(self):
        """Returns the match for a trim-begin marker line, or None."""
        return RE_TRIM_BEGIN.search(self._text)

    @obj_memoized
    def has_trim_end(self):
        """Returns the match for a trim-end marker line, or None."""
        return RE_TRIM_END.search(self._text)


# NOTE(review): `_trim(text, trim_begin=True, trim_end=True)` and
# `compile_text(parts)` are defined at this point of the module.  The patch
# shows them only as fragments of unchanged context, so they are deliberately
# not reproduced here -- keep the existing definitions.


def change_master(text, master):
    """
    Changes the master tag in a WL document.

    The last child of the root element is renamed to `master`; the document
    is returned serialized as UTF-8.

    >>> change_master("<utwor><powiesc></powiesc></utwor>", "liryka_l")
    '<utwor><liryka_l/></utwor>'
    """
    e = etree.fromstring(text)
    e[-1].tag = master
    return etree.tostring(e, encoding="utf-8")


def basic_structure(text, master):
    """
    Wraps plain `text` in a minimal WL skeleton.

    The text ends up inside a `master` element under <utwor>, between a
    trim-begin and a trim-end comment.  Returns the UTF-8 serialization.
    """
    # 'master' is only a placeholder tag name; it is replaced right below.
    # NOTE(review): the template's exact whitespace was lost in extraction;
    # confirm against the repository.
    e = etree.fromstring('''<utwor>
<master>
<!--%s--><!--%s-->
</master>
</utwor>''' % (TRIM_BEGIN, TRIM_END))
    e[0].tag = master
    # The tail of the first comment is the space between the two markers.
    e[0][0].tail = "\n" * 3 + text + "\n" * 3
    return etree.tostring(e, encoding="utf-8")


def _find_master(root):
    """Returns the last child of `root` with a master tag, else ParseError."""
    for child in root[::-1]:
        if child.tag in MASTERS:
            return child
    # Also reached when the root has no children at all.
    raise ParseError('No master tag found!')


def add_trim_begin(text):
    """
    Inserts a trim-begin comment as the first node of the master element.

    Raises ParseError if the document has no master element.
    """
    trim_tag = etree.Comment(TRIM_BEGIN)
    e = etree.fromstring(text)
    master = _find_master(e)

    master.insert(0, trim_tag)
    # Text that used to open the master element now follows the marker.
    trim_tag.tail = '\n\n\n' + (master.text or '')
    master.text = '\n'
    return etree.tostring(e, encoding="utf-8")


def add_trim_end(text):
    """
    Appends a trim-end comment as the last node of the master element.

    Raises ParseError if the document has no master element.
    """
    trim_tag = etree.Comment(TRIM_END)
    e = etree.fromstring(text)
    master = _find_master(e)

    master.append(trim_tag)
    trim_tag.tail = '\n'
    # Pad whatever precedes the marker: the previous sibling's tail, or the
    # master's own text when the marker is its only child.
    prev = trim_tag.getprevious()
    if prev is not None:
        prev.tail = (prev.tail or '') + '\n\n\n'
    else:
        master.text = (master.text or '') + '\n\n\n'
    return etree.tostring(e, encoding="utf-8")