Edumed import from pad + auto tagging

[redakcja.git] / apps / catalogue / management / edumed.py
diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py

new file mode 100644 (file)

index 0000000..e5d5ee0
--- /dev/null
+++ b/apps/catalogue/management/edumed.py
@@ -0,0 +1,400 @@
+# EduMed auto-tagger
+# -*- coding: utf-8 -*-
+import re
+from slughifi import slughifi
+
+
+class Tagger:
+    """Base class for the line taggers of the EduMed auto-tagger.
+
+    A tagger is built from a shared ``state`` dict and the list of input
+    ``lines``.  Its ``tag(position)`` method tries to recognise something
+    starting at ``position`` and returns the index of the first line it did
+    not consume, or None when nothing could be tagged there.
+    """
+
+    # A line is empty when it holds nothing but whitespace.  The pattern is
+    # used with match(), so it has to be anchored at the end of the line:
+    # an unanchored r"\s+" also accepts every indented line of real text,
+    # which made skip_empty() jump over content.
+    empty_line = re.compile(r"\s*$")
+
+    def __init__(self, state, lines):
+        self.state = state
+        self.lines = lines
+
+    def spawn(self, cls):
+        """Create a ``cls`` tagger sharing this tagger's state and lines."""
+        return cls(self.state, self.lines)
+
+    def line(self, position):
+        """Return the input line at ``position`` (IndexError past the end)."""
+        return self.lines[position]
+
+    def skip_empty(self, position):
+        """Return the index of the first non-empty line at or after ``position``.
+
+        Raises IndexError when only empty lines are left; eatany() and
+        eatseq() catch that error.
+        """
+        while self.empty_line.match(self.line(position)):
+            position += 1
+        return position
+
+    def tag(self, position):
+        """
+Return None -- means that we can't tag it in any way
+        """
+        return None
+
+    def wrap(self, tagname, content):
+        """Return ``content`` enclosed in a ``<tagname>`` element."""
+        return u"<%s>%s</%s>" % (tagname, content, tagname)
+
+
+class Section(Tagger):
+    """Chapter heading: Roman numeral (I, V, X), a dot, whitespace, title."""
+
+    looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
+
+    def tag(self, pos):
+        """Recognise a heading on the first non-empty line at or after ``pos``.
+
+        On success the heading text is stored in ``self.title`` and the index
+        of the line after the heading is returned; otherwise returns None.
+        """
+        start = self.skip_empty(pos)
+        found = self.looks_like.match(self.line(start))
+        if not found:
+            return None
+        self.title = found.group(1)
+        return start + 1
+
+    def __unicode__(self):
+        """Render the heading as a ``naglowek_rozdzial`` element."""
+        return self.wrap("naglowek_rozdzial", self.title)
+
+
+class Meta(Tagger):
+    """A single ``Key: value`` metadata line."""
+
+    looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
+
+    def tag(self, pos):
+        """Store one ``Key: value`` pair in ``state['meta']``.
+
+        Leading empty lines are skipped first.  Returns the index of the line
+        after the metadata line, or None when the line is not ``Key: value``.
+        """
+        pos = self.skip_empty(pos)
+        found = self.looks_like.match(self.line(pos))
+        if not found:
+            return None
+        key, value = found.groups()
+        # The 'meta' dict is created in the shared state on first use.
+        self.state.setdefault('meta', {})[key] = value
+        return pos + 1
+
+
+class Informacje(Tagger):
+    """Document header: a Section heading followed by Meta lines."""
+
+    def tag(self, pos):
+        """Consume the heading and every metadata line that follows it.
+
+        The heading tagger is kept in ``self.title`` and the matched Meta
+        taggers in ``self.meta``.  Returns the index of the first line that
+        is not part of the header, or None when there is no heading.
+        """
+        self.title = self.spawn(Section)
+        self.meta = []
+        pos = self.title.tag(pos)
+        if pos is None:
+            return None
+
+        # collect meta
+        while True:
+            meta = self.spawn(Meta)
+            try:
+                pos = self.skip_empty(pos)
+                pos2 = meta.tag(pos)
+            except IndexError:
+                # The metadata runs up to the end of the input.  Stop here
+                # and keep what was collected; letting the error escape
+                # would make eatseq() throw the whole header away.
+                break
+            if pos2 is None:
+                break
+            self.meta.append(meta)
+            pos = pos2
+
+        return pos
+
+
+class List(Tagger):
+    """A run of consecutive lines that start with ``-`` or ``*``."""
+
+    def tag(self, pos):
+        """Collect bullet lines starting at ``pos`` into ``self.items``.
+
+        Returns the index of the first non-bullet line, or None when the
+        line at ``pos`` is not a bullet.  Reading past the last input line
+        raises IndexError, which eatany() catches.
+        """
+        collected = []
+        self.items = collected
+        while True:
+            current = self.line(pos)
+            if not current or current[0] not in ('-', '*'):
+                break
+            # Drop the bullet character and the surrounding whitespace.
+            collected.append(current[1:].strip())
+            pos += 1
+        return pos if collected else None
+
+    def __unicode__(self):
+        """Render the items as ``<punkt>`` elements inside ``<lista>``."""
+        points = ["<punkt>%s</punkt>\n" % item for item in self.items]
+        return "<lista>\n" + "".join(points) + "</lista>\n"
+
+
+class Paragraph(Tagger):
+    """Any single line: plain paragraph, subsection heading or ignored line.
+
+    NOTE: tag() stores the text in ``self.line``, which shadows the inherited
+    ``Tagger.line`` method on the instance; mark_activities() and
+    mark_dictionary() read and rewrite that attribute.
+    """
+
+    # The patterns are unicode literals compiled with re.UNICODE.  Under
+    # Python 2 a byte-string pattern holding UTF-8 encoded letters is
+    # compared byte by byte against the decoded input, so the Polish
+    # phrases never matched and case folding did not cover their letters.
+
+    # Lines that produce no output at all.
+    remove_this = [
+        re.compile(u"[\\s]*opis zawartości[\\s]*", re.I | re.U),
+        re.compile(r"^[\s]*$")
+        ]
+    # Lines rendered as subsection headings.
+    podrozdzial = [
+        re.compile(u"[\\s]*(przebieg zajęć|opcje dodatkowe)[\\s]*",
+                   re.I | re.U),
+        ]
+
+    def tag(self, pos):
+        """Take the line at ``pos`` and classify it; always returns pos + 1.
+
+        Raises IndexError when ``pos`` is past the last line.
+        """
+        self.line = self.lines[pos]
+        self.ignore = any(x.match(self.line) for x in self.remove_this)
+        self.is_podrozdzial = any(x.match(self.line)
+                                  for x in self.podrozdzial)
+        return pos + 1
+
+    def __unicode__(self):
+        """Render as ``naglowek_podrozdzial`` or ``akap``; u'' if ignored."""
+        if self.ignore:
+            return u''
+        if self.is_podrozdzial:
+            tag = 'naglowek_podrozdzial'
+        else:
+            tag = 'akap'
+        return u"<%s>%s</%s>" % (tag, self.line, tag)
+
+
+class Container:
+    """An output element named ``tag_name`` wrapping strings or renderables."""
+
+    def __init__(self, tag_name, *elems):
+        self.tag_name = tag_name
+        self.elems = elems
+
+    def __unicode__(self):
+        """Render the element with its children.
+
+        String children are written inline.  Any other child goes on its own
+        line, indented by two spaces, and then the closing tag is put on a
+        new line as well.
+        """
+        pieces = [u"<%s>" % self.tag_name]
+        has_nested = False
+        for child in self.elems:
+            if isinstance(child, (str, unicode)):
+                pieces.append(unicode(child))
+            else:
+                pieces.append("\n  " + unicode(child))
+                has_nested = True
+
+        if has_nested:
+            pieces.append("\n")
+        pieces.append(u"</%s>" % self.tag_name)
+        return u"".join(pieces)
+
+
+def eatany(pos, *taggers):
+    """Try the taggers in order at ``pos`` and return the first that matches.
+
+    Returns ``(tagger, new_pos)`` for the first tagger whose tag() result is
+    truthy, or ``(None, pos)`` when none matches.  An IndexError raised by
+    any tagger stops the search and also yields ``(None, pos)``.
+    """
+    winner, reached = None, pos
+    try:
+        for candidate in taggers:
+            result = candidate.tag(pos)
+            if result:
+                winner, reached = candidate, result
+                break
+    except IndexError:
+        pass
+    return (winner, reached)
+
+
+def eatseq(pos, *taggers):
+    """Apply the taggers one after another, starting at ``pos``.
+
+    Each tagger starts where the previous one stopped.  Stops at the first
+    tagger whose tag() returns None.  Returns ``(matched, pos)``: a tuple of
+    the taggers that matched, in order, and the position after the last one.
+    An IndexError from a tagger is reported on stdout and ends the sequence.
+    """
+    matched = []
+    try:
+        for candidate in taggers:
+            reached = candidate.tag(pos)
+            if reached is None:
+                break
+            matched.append(candidate)
+            pos = reached
+    except IndexError:
+        print("Got index error for pos=%d" % pos)
+    return (tuple(matched), pos)
+
+
+def tagger(text):
+    """
+tagger(text) function name and signature is a contract.
+returns auto-tagged text
+
+Byte strings are decoded as UTF-8.  The text has to start with a section
+heading (an Informacje header, optionally followed by "Key: value" lines);
+ValueError is raised when it does not.
+    """
+    if not isinstance(text, unicode):
+        text = text.decode('utf-8')
+    lines = text.split("\n")
+    state = {}
+    info = Informacje(state, lines)
+
+    matched, pos = eatseq(0, info)
+    if not matched:
+        # toxml() needs the header's title and metadata, so there is no
+        # point in going on without it.
+        raise ValueError("text does not start with a section heading")
+
+    content = [info]
+
+    # The bound is checked on every iteration: a tagger that consumes the
+    # very last line returns len(lines), and indexing lines[] with that
+    # position would raise IndexError.
+    while pos < len(lines):
+        x, pos = eatany(pos, info.spawn(Section),
+                        info.spawn(List), info.spawn(Paragraph))
+
+        if x is not None:
+            content.append(x)
+        else:
+            # Nothing matched (or a tagger ran off the end of the input):
+            # keep the raw line and move on.
+            content.append(lines[pos])
+            pos += 1
+
+    return toxml(content)
+
+# Dublin Core values that toxml() writes unchanged into every document.
+# NOTE(review): the 'relation' and 'description.material' texts look like
+# placeholders for links -- confirm before relying on them.
+dc_fixed = {
+    'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
+    'relation': u'moduły powiązane linki',
+    'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
+    'description.material': u'linki do załączników',
+}
+
+
+def mark_activities(content):
+    """Group the paragraphs of each activity into an 'aktywnosc' Container.
+
+    Activities are looked for after a paragraph matching "przebieg zaj.."
+    and until a paragraph that looks like the next Roman-numbered section.
+    An activity starts at a numbered paragraph ("1. ...", the number is
+    stripped from its text) and ends at the following "czas:" paragraph.
+
+    Each activity becomes
+    ``aktywnosc(opis, wskazowki, <pomoce/forma/czas ...>)``: 'opis' holds
+    the numbered paragraph, 'wskazowki' the elements between it and the
+    first property line.  ``content`` is modified in place and returned.
+
+    NOTE(review): only Paragraph elements are inspected, so a Section
+    element does not end the activity area -- confirm this is intended.
+    """
+    # ".." stands for the two Polish letters at the end of the word.
+    is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
+    is_next_section = re.compile(r"^[IVX]+[.]? ")
+    is_activity = re.compile(r"^[0-9]+[.]? (.+)")
+
+    is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
+    is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
+    is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
+    activity_props = {
+        'pomoce': is_activity_tools,
+        'forma': is_activity_work,
+        'czas': is_activity_time
+        }
+
+    # Pass 1: find the (start, end) index range of every activity.
+    activities = []
+    in_activities = False
+    ab = -1  # index of the currently open activity, -1 when none is open
+    for i, e in enumerate(content):
+        if not isinstance(e, Paragraph):
+            continue
+        if not in_activities and is_przebieg.match(e.line):
+            in_activities = True
+        if in_activities and is_next_section.match(e.line):
+            in_activities = False
+            ab = -1
+        if not in_activities:
+            continue
+
+        m = is_activity.match(e.line)
+        if m:
+            e.line = m.groups()[0]
+            ab = i
+        # A "czas:" line closes an activity only when one is open.  Without
+        # this guard a stray line produced a range starting at -1, or one
+        # overlapping the previous activity.
+        if ab >= 0 and is_activity_time.match(e.line):
+            activities.append((ab, i + 1))
+            ab = -1
+
+    # Pass 2: replace the ranges back to front, so that the indexes of the
+    # ranges still to be processed stay valid.
+    for ab, ae in reversed(activities):
+        info_start = ae
+
+        act_els = [Container("opis", content[ab])]
+        for i in range(ab, ae):
+            e = content[i]
+            if isinstance(e, Paragraph):
+                for prop, pattern in activity_props.items():
+                    m = pattern.match(e.line)
+                    if m:
+                        act_els.append(Container(prop, m.groups()[0]))
+                        # Remember where the property lines begin.
+                        if info_start > i:
+                            info_start = i
+        act_els.insert(1, Container('wskazowki',
+                                    *content[ab + 1:info_start]))
+        content[ab:ae] = [Container('aktywnosc', *act_els)]
+    return content
+
+
+def mark_dictionary(content):
+    """Wrap the body of every glossary section in a 'slowniczek' Container.
+
+    A glossary starts after a Section whose title matches "s.owniczek" and
+    runs up to the next Section or to the end of the document.  Paragraphs
+    of the form "term - definition" become a 'definiendum' / 'definiens'
+    pair of Containers; other paragraphs are kept unchanged.  Elements that
+    are not Paragraphs are dropped from the glossary body.  ``content`` is
+    modified in place and returned.
+    """
+    # "." stands for the Polish letter in the word.
+    is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
+    is_dictentry = re.compile(r"([^-]+) - (.+)")
+
+    db = -1  # index of the first glossary element, -1 when none is open
+    slowniczek = []
+    i = 0
+    while i < len(content):
+        e = content[i]
+        if isinstance(e, Section):
+            if db >= 0:
+                # This section ends the open glossary.  The replacement
+                # changes the list length, so point i at the section's new
+                # index, and close the glossary so that later sections do
+                # not replace the range again.
+                content[db:i] = [Container('slowniczek', *slowniczek)]
+                i = db + 1
+                db = -1
+                slowniczek = []
+            if is_dictionary.match(e.title):
+                db = i + 1
+        elif db >= 0 and isinstance(e, Paragraph):
+            m = is_dictentry.match(e.line)
+            if m:
+                slowniczek.append(Container('definiendum', m.groups()[0]))
+                slowniczek.append(Container('definiens', m.groups()[1]))
+            else:
+                slowniczek.append(e)
+        i += 1
+
+    if db >= 0:
+        # The glossary is the last section of the document.
+        content[db:] = [Container('slowniczek', *slowniczek)]
+
+    return content
+
+
+def toxml(content):
+    """Serialise tagged content into the final XML document (unicode).
+
+    ``content[0]`` must be the Informacje header; its shared state supplies
+    the metadata for the Dublin Core block.  The other elements are rendered
+    with unicode(), except plain strings, which become 'akap' elements when
+    they are not empty.  Activities and the glossary are marked up first.
+
+    NOTE(review): metadata and paragraph text are inserted without XML
+    escaping -- confirm the input never contains '&' or '<'.
+    """
+    content = mark_activities(content)
+    content = mark_dictionary(content)
+    info = content.pop(0)
+
+    # state['meta'] only exists when at least one "Key: value" line was read.
+    meta = info.state.get('meta', {})
+    slug = slughifi(meta.get(u'Tytuł modułu', ''))
+    chunks = []
+
+    def p(t):
+        chunks.append(u"%s\n" % t)
+
+    def dc(k, v):
+        p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
+
+    def t(tag, ct):
+        p(u'<%s>%s</%s>' % (tag, ct, tag))
+
+    def a(ct):
+        if ct:
+            t(u'akap', ct)
+
+    p("<utwor>")
+    p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
+    p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
+    # "First Last" becomes "Last, First".  A missing or blank author list
+    # gives no creator element, and a one-word name is written as it is,
+    # without a dangling comma.
+    for author in meta.get(u'Autorzy', u'').split(u','):
+        names = author.split()
+        if not names:
+            continue
+        lastname = names.pop()
+        if names:
+            author = u' '.join([lastname + u","] + names)
+        else:
+            author = lastname
+        dc(u'creator', author)
+    dc(u'title', meta.get(u'Tytuł modułu', u''))
+    dc(u'relation.isPartOf', meta.get(u'Dział', u''))
+    dc(u'publisher', u'Fundacja Nowoczesna Polska')
+    dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
+    dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
+    for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
+        keyword = keyword.strip()
+        # Skip blanks, so that an absent keyword list does not produce an
+        # empty subject element.
+        if keyword:
+            dc(u'subject', keyword)
+    dc(u'description', dc_fixed['description'])
+    dc(u'description.material', dc_fixed['description.material'])
+    dc(u'relation', dc_fixed['relation'])
+    dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
+    dc(u'rights', dc_fixed['rights'])
+    dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
+    dc(u'format', u'xml')
+    dc(u'type', u'text')
+    dc(u'date', u'2012-11-09')  # TODO
+    dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
+    dc(u'language', u'pol')
+    p(u'</rdf:Description>')
+    p(u'</rdf:RDF>')
+
+    p(u'<powiesc>')
+    t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
+    p(u'<nota>')
+    a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
+    p(u'</nota>')
+
+    p(unicode(info.title))
+    for elm in content:
+        if isinstance(elm, (unicode, str)):
+            # Raw lines that no tagger claimed.
+            a(elm)
+            continue
+        p(unicode(elm))
+
+    p(u'</powiesc>')
+    p(u'</utwor>')
+
+    return u"".join(chunks)
+
+
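+# TODO / TBD
+# sort out the subsections (podrozdziały):
+#  'Przebieg zajęć' (course of the lesson)
+#  'opcje dodatkowe' (additional options)
+# remove the 'opis zawartości' (content description) line
+# joined (multi-line) paragraph?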