X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/e2d4fbf230c90dcf3e904351a70d399426478352..b6b62a76505574e1961e15900a96a70a5461f108:/apps/catalogue/management/edumed.py?ds=inline diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py index ffad2ec1..05034368 100644 --- a/apps/catalogue/management/edumed.py +++ b/apps/catalogue/management/edumed.py @@ -4,7 +4,7 @@ import re from slughifi import slughifi -class Tagger: +class Tagger(object): def __init__(self, state, lines): self.state = state self.lines = lines @@ -15,9 +15,9 @@ class Tagger: def line(self, position): return self.lines[position] - ignore = [ re.compile(r"^[\[][PA][\]] - [^ ]+$") ] + ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")] empty_line = re.compile(r"^\s+$") - + def skip_empty(self, position): while self.line(position) == "" or \ self.empty_line.match(self.line(position)) or \ @@ -26,7 +26,6 @@ class Tagger: position += 1 return position - def tag(self, position): """ Return None -- means that we can't tag it in any way @@ -36,10 +35,18 @@ Return None -- means that we can't tag it in any way def wrap(self, tagname, content): return u"<%s>%s" % (tagname, content, tagname) + @staticmethod + def anymatches(regex): + return lambda x: regex.match(x) + class Section(Tagger): looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$") + def __init__(self, *a): + super(Section, self).__init__(*a) + self.is_podrozdzial = False + def tag(self, pos): pos2 = self.skip_empty(pos) pos = pos2 @@ -49,7 +56,8 @@ class Section(Tagger): return pos + 1 def __unicode__(self): - return self.wrap("naglowek_rozdzial", self.title) + return self.wrap(self.is_podrozdzial and "naglowek_podrozdzial" or "naglowek_rozdzial", + self.title) class Meta(Tagger): @@ -87,47 +95,68 @@ class Informacje(Tagger): class List(Tagger): - def tag(self, pos): + point = re.compile(r"^[\s]*[-*·]{1,2}(.*)") + num = re.compile(r"^[\s]*[a-z][.]\s+(.*)") + + def __init__(self, *args): + + super(List, self).__init__(*args) self.items = [] + self.type = 'punkt' + + def tag(self, pos): while True: l = self.line(pos) - if l and l[0] in ('-', '*'): - self.items.append(l[1:].strip()) + m = self.point.match(l) + if not m: + m = self.num.match(l) + if m: self.type = 'num' + if l and m: + self.items.append(m.groups()[0].lstrip()) pos += 1 else: break if self.items: return pos + def append(self, tagger): + self.items.append(tagger) + def __unicode__(self): - s = "\n" + s = '' % self.type for i in self.items: - s += "%s\n" % i - s += "\n" + if isinstance(i, list): + x = "\n".join(map(lambda elem: unicode(elem), i)) + else: + x = unicode(i) + s += "\n%s" % x + s += "\n\n" return s class Paragraph(Tagger): remove_this = [ re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I), - re.compile(r"^[\s]*$") + re.compile(r"^[\s]*$"), + re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek") ] podrozdzial = [ - re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I), + re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I), ] + def tag(self, pos): self.line = self.lines[pos] self.ignore = False self.is_podrozdzial = False - + for x in self.remove_this: if x.match(self.line): self.ignore = True - + for x in self.podrozdzial: if x.match(self.line): self.is_podrozdzial = True - + return pos + 1 def __unicode__(self): @@ -189,7 +218,7 @@ def eatseq(pos, *taggers): return (tuple(good), pos) -def tagger(text): +def tagger(text, pretty_print=False): """ tagger(text) function name and signature is a contract. returns auto-tagged text @@ -201,7 +230,7 @@ returns auto-tagged text content = [] state = {} info = Informacje(state, lines) - + ((info,), pos) = eatseq(pos, info) # print "[i] %d. %s" % (pos, lines[pos]) @@ -220,23 +249,63 @@ returns auto-tagged text if pos >= len(lines): break - return toxml(content) + return toxml(content, pretty_print=pretty_print) dc_fixed = { 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).', - 'relation': u'moduły powiązane linki', + 'relation': u'moduły powiązane linki', 'description.material': u'linki do załączników', 'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0', } +class NotFound(Exception): + pass + + +def find_block(content, title_re, begin=-1, end=-1): + title_re = re.compile(title_re, re.I | re.UNICODE) + + rb = -1 + if begin < 0: begin = 0 + if end < 0: end = len(content) + + for i in range(begin, end): + elem = content[i] + if isinstance(elem, Paragraph): + if title_re.match(elem.line): + rb = i + continue + if isinstance(elem, Section): + if title_re.match(elem.title): + rb = i + continue + if rb >= 0: + if isinstance(elem, List): + continue + if isinstance(elem, Paragraph) and elem.line: + continue + break + if rb >= 0: + return rb, i + raise NotFound() + + +def remove_block(content, title_re, removed=None): + rb, re = find_block(content, title_re) + if removed is not None and isinstance(removed, list): + removed += content[rb:re][:] + content[rb:re] = [] + return content + + def mark_activities(content): i = 0 tl = len(content) is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I) - # import pdb; pdb.set_trace() + is_next_section = re.compile(r"^[IVX]+[.]? ") - is_activity = re.compile(r"^[0-9]+[.]? (.+)") + is_activity = re.compile(r"^[0-9]+[.] (.+)") is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)") is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)") @@ -253,14 +322,16 @@ def mark_activities(content): ae = -1 while True: e = content[i] + if isinstance(e, Section): + if in_activities and \ + is_next_section.match(e.title): + in_activities = False + if isinstance(e, Paragraph): if not in_activities and \ is_przebieg.match(e.line): in_activities = True - if in_activities and \ - is_next_section.match(e.line): - in_activities = False if in_activities: m = is_activity.match(e.line) if m: @@ -299,7 +370,8 @@ def mark_dictionary(content): i = 0 is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I) is_dictentry = re.compile(r"([^-]+) - (.+)") - slowniczek = [] + slowniczek = content[0].spawn(List) + slowniczek.type = 'slowniczek' while i < len(content): e = content[i] if isinstance(e, Section): @@ -307,13 +379,15 @@ def mark_dictionary(content): db = i + 1 elif db >= 1: de = i - content[db:de] = [Container('slowniczek', *slowniczek)] + content[db:de] = [slowniczek] + break elif db >= 0: if isinstance(e, Paragraph): m = is_dictentry.match(e.line) if m: - slowniczek.append(Container('definiendum', m.groups()[0])) - slowniczek.append(Container('definiens', m.groups()[1])) + slowniczek.append([Container('definiendum', m.groups()[0]), + Container('definiens', m.groups()[1])]) + else: slowniczek.append(e) i += 1 @@ -321,9 +395,42 @@ def mark_dictionary(content): return content -def toxml(content): +def move_evaluation(content): + evaluation = [] + + content = remove_block(content, r"ewaluacja[+ PA\[\].]*", evaluation) + if evaluation: + # print "found evaluation %s" % (evaluation,) + evaluation[0].is_podrozdzial = True + # evaluation place + opcje_dodatkowe = find_block(content, r"opcje dodatkowe\s*") + if opcje_dodatkowe: + # print "putting evaluation just before opcje dodatkowe @ %s" % (opcje_dodatkowe, ) + content[opcje_dodatkowe[0]:opcje_dodatkowe[0]] = evaluation + else: + materialy = find_block(content, r"materia.y[+ AP\[\].]*") + if materialy: + # print "putting evaluation just before materialy @ %s" % (materialy, ) + content[materialy[0]:materialy[0]] = evaluation + else: + print "er.. no idea where to place evaluation" + return content + + +def toxml(content, pretty_print=False): + # some transformations content = mark_activities(content) content = mark_dictionary(content) + try: + content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*") + except NotFound: + pass + try: + content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*") + except NotFound: + pass + content = move_evaluation(content) + info = content.pop(0) state = info.state @@ -379,9 +486,9 @@ def toxml(content): p(u'') t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u'')) - p(u'') - a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u'')) - p(u'') + # p(u'') + a(u'' % meta.get(u'Numer porządkowy', u'')) + # p(u'') p(unicode(info.title)) for elm in content: @@ -393,6 +500,12 @@ def toxml(content): p(u'') p(u'') + if pretty_print: + from lxml import etree + from StringIO import StringIO + xml = etree.parse(StringIO(holder['xml'])) + holder['xml'] = etree.tostring(xml, pretty_print=pretty_print, encoding=unicode) + return holder['xml']