X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/577bb2b84d936d65bf072ef0a6b898db9d6e77ab..2f9c60b76f3ab4e69d794a6bb14388a81ff29eb7:/apps/catalogue/xml_tools.py diff --git a/apps/catalogue/xml_tools.py b/apps/catalogue/xml_tools.py new file mode 100755 index 00000000..928e57be --- /dev/null +++ b/apps/catalogue/xml_tools.py @@ -0,0 +1,204 @@ +from functools import wraps +import re + +from lxml import etree +from catalogue.constants import TRIM_BEGIN, TRIM_END, MASTERS + +RE_TRIM_BEGIN = re.compile("^$" % TRIM_BEGIN, re.M) +RE_TRIM_END = re.compile("^$" % TRIM_END, re.M) + + +class ParseError(BaseException): + pass + + +def obj_memoized(f): + """ + A decorator that caches return value of object methods. + The cache is kept with the object, in a _obj_memoized property. + """ + @wraps(f) + def wrapper(self, *args, **kwargs): + if not hasattr(self, '_obj_memoized'): + self._obj_memoized = {} + key = (f.__name__,) + args + tuple(sorted(kwargs.iteritems())) + try: + return self._obj_memoized[key] + except TypeError: + return f(self, *args, **kwargs) + except KeyError: + self._obj_memoized[key] = f(self, *args, **kwargs) + return self._obj_memoized[key] + return wrapper + + +class GradedText(object): + _edoc = None + + ROOT = 'utwor' + RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF' + + def __init__(self, text): + self._text = text + + @obj_memoized + def is_xml(self): + """ + Determines if it's a well-formed XML. + + >>> GradedText("").is_xml() + True + >>> GradedText("").is_xml() + False + """ + try: + self._edoc = etree.fromstring(self._text) + except etree.XMLSyntaxError: + return False + return True + + @obj_memoized + def is_wl(self): + """ + Determines if it's an XML with a and a master tag. + + >>> GradedText("").is_wl() + True + >>> GradedText("").is_wl() + False + """ + if self.is_xml(): + e = self._edoc + # FIXME: there could be comments + ret = e.tag == self.ROOT and ( + len(e) == 1 and e[0].tag in MASTERS or + len(e) == 2 and e[0].tag == self.RDF + and e[1].tag in MASTERS) + if ret: + self._master = e[-1].tag + del self._edoc + return ret + else: + return False + + @obj_memoized + def is_broken_wl(self): + """ + Determines if it at least looks like broken WL file + and not just some untagged text. + + >>> GradedText("<").is_broken_wl() + True + >>> GradedText("some text").is_broken_wl() + False + """ + if self.is_wl(): + return True + text = self._text.strip() + return text.startswith('') and text.endswith('') + + def master(self): + """ + Gets the master tag. + + >>> GradedText("").master() + 'powiesc' + """ + assert self.is_wl() + return self._master + + @obj_memoized + def has_trim_begin(self): + return RE_TRIM_BEGIN.search(self._text) + + @obj_memoized + def has_trim_end(self): + return RE_TRIM_END.search(self._text) + + +def _trim(text, trim_begin=True, trim_end=True): + """ + Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so + that eg. one big XML file can be compiled from many small XML files. + """ + if trim_begin: + text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1] + if trim_end: + text = RE_TRIM_END.split(text, maxsplit=1)[0] + return text + + +def compile_text(parts): + """ + Compiles full text from an iterable of parts, + trimming where applicable. + """ + texts = [] + trim_begin = False + text = '' + for next_text in parts: + if not next_text: + continue + # trim the end, because there's more non-empty text + # don't trim beginning, if `text' is the first non-empty part + texts.append(_trim(text, trim_begin=trim_begin)) + trim_begin = True + text = next_text + # don't trim the end, because there's no more text coming after `text' + # only trim beginning if it's not still the first non-empty + texts.append(_trim(text, trim_begin=trim_begin, trim_end=False)) + return "".join(texts) + + +def change_master(text, master): + """ + Changes the master tag in a WL document. + """ + e = etree.fromstring(text) + e[-1].tag = master + return etree.tostring(e, encoding="utf-8") + + +def basic_structure(text, master): + e = etree.fromstring(''' + + + +''' % (TRIM_BEGIN, TRIM_END)) + e[0].tag = master + e[0][0].tail = "\n"*3 + text + "\n"*3 + return etree.tostring(e, encoding="utf-8") + + +def add_trim_begin(text): + trim_tag = etree.Comment(TRIM_BEGIN) + e = etree.fromstring(text) + for master in e[::-1]: + if master.tag in MASTERS: + break + if master.tag not in MASTERS: + raise ParseError('No master tag found!') + + master.insert(0, trim_tag) + trim_tag.tail = '\n\n\n' + (master.text or '') + master.text = '\n' + return etree.tostring(e, encoding="utf-8") + + +def add_trim_end(text): + trim_tag = etree.Comment(TRIM_END) + e = etree.fromstring(text) + for master in e[::-1]: + if master.tag in MASTERS: + break + if master.tag not in MASTERS: + raise ParseError('No master tag found!') + + master.append(trim_tag) + trim_tag.tail = '\n' + prev = trim_tag.getprevious() + if prev is not None: + prev.tail = (prev.tail or '') + '\n\n\n' + else: + master.text = (master.text or '') + '\n\n\n' + return etree.tostring(e, encoding="utf-8")