apps/wiki/xml_tools.py

   1 import re
   2
   3 from lxml import etree
   4
   5 from wiki.constants import RE_TRIM_BEGIN, RE_TRIM_END
   6
   7 class GradedText(object):
   8     _is_xml = None
   9     _edoc = None
  10     _is_wl = None
  11     _master = None
  12
  13     ROOT = 'utwor'
  14     MASTERS = ['powiesc',
  15                'opowiadanie',
  16                'liryka_l',
  17                'liryka_lp',
  18                'dramat_wierszowany_l',
  19                'dramat_wierszowany_lp',
  20                'dramat_wspolczesny',
  21                ]
  22     RDF = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF'
  23
  24     def __init__(self, text):
  25         self._text = text
  26
  27     def is_xml(self):
  28         if self._is_xml is None:
  29             try:
  30                 self._edoc = etree.fromstring(self._text)
  31             except etree.XMLSyntaxError:
  32                 self._is_xml = False
  33             else:
  34                 self._is_xml = True
  35             del self._text
  36         return self._is_xml
  37
  38     def is_wl(self):
  39         if self._is_wl is None:
  40             if self.is_xml():
  41                 e = self._edoc
  42                 self._is_wl = e.tag == self.ROOT and (
  43                     len(e) == 1 and e[0].tag in self.MASTERS or
  44                     len(e) == 2 and e[0].tag == self.RDF
  45                         and e[1].tag in self.MASTERS)
  46                 if self._is_wl:
  47                     self._master = e[-1].tag
  48                 del self._edoc
  49             else:
  50                 self._is_wl = False
  51         return self._is_wl
  52
  53     def master(self):
  54         assert self.is_wl()
  55         return self._master
  56
  57
  58 def _trim(text, trim_begin=True, trim_end=True):
  59     """
  60         Cut off everything before RE_TRIM_BEGIN and after RE_TRIM_END, so
  61         that eg. one big XML file can be compiled from many small XML files.
  62     """
  63     if trim_begin:
  64         text = RE_TRIM_BEGIN.split(text, maxsplit=1)[-1]
  65     if trim_end:
  66         text = RE_TRIM_END.split(text, maxsplit=1)[0]
  67     return text
  68
  69
  70 def compile_text(parts):
  71     """
  72         Compiles full text from an iterable of parts,
  73         trimming where applicable.
  74     """
  75     texts = []
  76     trim_begin = False
  77     text = ''
  78     for next_text in parts:
  79         if not next_text:
  80             continue
  81         # trim the end, because there's more non-empty text
  82         # don't trim beginning, if `text' is the first non-empty part
  83         texts.append(_trim(text, trim_begin=trim_begin))
  84         trim_begin = True
  85         text = next_text
  86     # don't trim the end, because there's no more text coming after `text'
  87     # only trim beginning if it's not still the first non-empty
  88     texts.append(_trim(text, trim_begin=trim_begin, trim_end=False))
  89     return "".join(texts)