apps/catalogue/management/edumed.py

   1 # EduMed auto-tagger
   2 # -*- coding: utf-8 -*-
   3 import re
   4 from slughifi import slughifi
   5
   6
   7 class Tagger:
   8     def __init__(self, state, lines):
   9         self.state = state
  10         self.lines = lines
  11
  12     def spawn(self, cls):
  13         return cls(self.state, self.lines)
  14
  15     def line(self, position):
  16         return self.lines[position]
  17
  18     empty_line = re.compile(r"\s+")
  19
  20     def skip_empty(self, position):
  21         while self.line(position) == "" or \
  22             self.empty_line.match(self.line(position)):
  23             position += 1
  24         return position
  25
  26     def tag(self, position):
  27         """
  28 Return None -- means that we can't tag it in any way
  29         """
  30         return None
  31
  32     def wrap(self, tagname, content):
  33         return u"<%s>%s</%s>" % (tagname, content, tagname)
  34
  35
  36 class Section(Tagger):
  37     looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
  38
  39     def tag(self, pos):
  40         pos2 = self.skip_empty(pos)
  41         pos = pos2
  42         m = self.looks_like.match(self.line(pos))
  43         if m:
  44             self.title = m.groups()[0]
  45             return pos + 1
  46
  47     def __unicode__(self):
  48         return self.wrap("naglowek_rozdzial", self.title)
  49
  50
  51 class Meta(Tagger):
  52     looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
  53
  54     def tag(self, pos):
  55         pos = self.skip_empty(pos)
  56         m = self.looks_like.match(self.line(pos))
  57         if m:
  58             k = m.groups()[0]
  59             v = m.groups()[1]
  60             m = self.state.get('meta', {})
  61             m[k] = v
  62             self.state['meta'] = m
  63             return pos + 1
  64
  65
  66 class Informacje(Tagger):
  67     def tag(self, pos):
  68         self.title = self.spawn(Section)
  69         self.meta = []
  70         pos = self.title.tag(pos)
  71         if pos is None: return
  72
  73             # collect meta
  74         while True:
  75             pos = self.skip_empty(pos)
  76             meta = self.spawn(Meta)
  77             pos2 = meta.tag(pos)
  78             if pos2 is None: break
  79             self.meta.append(meta)
  80             pos = pos2
  81
  82         return pos
  83
  84
  85 class List(Tagger):
  86     def tag(self, pos):
  87         self.items = []
  88         while True:
  89             l = self.line(pos)
  90             if l and l[0] in ('-', '*'):
  91                 self.items.append(l[1:].strip())
  92                 pos += 1
  93             else:
  94                 break
  95         if self.items:
  96             return pos
  97
  98     def __unicode__(self):
  99         s = "<lista>\n"
 100         for i in self.items:
 101             s += "<punkt>%s</punkt>\n" % i
 102         s += "</lista>\n"
 103         return s
 104
 105
 106 class Paragraph(Tagger):
 107     remove_this = [
 108         re.compile(r"[\s]*opis zawartości[\s]*", re.I),
 109         re.compile(r"^[\s]*$")
 110         ]
 111     podrozdzial = [
 112         re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
 113         ]
 114     def tag(self, pos):
 115         self.line = self.lines[pos]
 116         self.ignore = False
 117         self.is_podrozdzial = False
 118
 119         for x in self.remove_this:
 120             if x.match(self.line):
 121                 self.ignore = True
 122
 123         for x in self.podrozdzial:
 124             if x.match(self.line):
 125                 self.is_podrozdzial = True
 126
 127         return pos + 1
 128
 129     def __unicode__(self):
 130         if not self.ignore:
 131             if self.is_podrozdzial:
 132                 tag = 'naglowek_podrozdzial'
 133             else:
 134                 tag = 'akap'
 135             return u"<%s>%s</%s>" % (tag, self.line, tag)
 136         else:
 137             return u''
 138
 139
 140 class Container:
 141     def __init__(self, tag_name, *elems):
 142         self.tag_name = tag_name
 143         self.elems = elems
 144
 145     def __unicode__(self):
 146         s = u"<%s>" % self.tag_name
 147         add_nl = False
 148         for e in self.elems:
 149             if isinstance(e, (str, unicode)):
 150                 s += unicode(e)
 151             else:
 152                 s += "\n  " + unicode(e)
 153                 add_nl = True
 154
 155         if add_nl: s += "\n"
 156         s += u"</%s>" % self.tag_name
 157         return s
 158
 159
 160 def eatany(pos, *taggers):
 161     try:
 162         for t in list(taggers):
 163             p = t.tag(pos)
 164             if p:
 165                 return (t, p)
 166     except IndexError:
 167         pass
 168     return (None, pos)
 169
 170
 171 def eatseq(pos, *taggers):
 172     good = []
 173     taggers = list(taggers[:])
 174     try:
 175         while len(taggers):
 176             p = taggers[0].tag(pos)
 177             if p is None:
 178                 return (tuple(good), pos)
 179             good.append(taggers.pop(0))
 180             # print "%d -> %d" % (pos, p)
 181             pos = p
 182
 183     except IndexError:
 184         print "Got index error for pos=%d" % pos
 185     return (tuple(good), pos)
 186
 187
 188 def tagger(text):
 189     """
 190 tagger(text) function name and signature is a contract.
 191 returns auto-tagged text
 192     """
 193     if not isinstance(text, unicode):
 194         text = unicode(text.decode('utf-8'))
 195     lines = text.split("\n")
 196     pos = 0
 197     content = []
 198     state = {}
 199     info = Informacje(state, lines)
 200
 201     ((info,), pos) = eatseq(pos, info)
 202
 203     # print "[i] %d. %s" % (pos, lines[pos])
 204
 205     content.append(info)
 206
 207     while True:
 208         x, pos = eatany(pos, info.spawn(Section),
 209                         info.spawn(List), info.spawn(Paragraph))
 210
 211         if x is not None:
 212             content.append(x)
 213         else:
 214             content.append(lines[pos])
 215             pos += 1
 216             if pos >= len(lines):
 217                 break
 218
 219     return toxml(content)
 220
 221 dc_fixed = {
 222     'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
 223     'relation':  u'moduły powiązane linki',
 224     'description.material': u'linki do załączników',
 225     'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
 226     }
 227
 228
 229 def mark_activities(content):
 230     i = 0
 231     tl = len(content)
 232     is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
 233     #    import pdb; pdb.set_trace()
 234     is_next_section = re.compile(r"^[IVX]+[.]? ")
 235     is_activity = re.compile(r"^[0-9]+[.]? (.+)")
 236
 237     is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
 238     is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
 239     is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
 240     activity_props = {
 241         'pomoce': is_activity_tools,
 242         'forma': is_activity_work,
 243         'czas': is_activity_time
 244         }
 245     activities = []
 246
 247     in_activities = False
 248     ab = -1
 249     ae = -1
 250     while True:
 251         e = content[i]
 252         if isinstance(e, Paragraph):
 253             if not in_activities and \
 254                 is_przebieg.match(e.line):
 255                 in_activities = True
 256
 257             if in_activities and \
 258                 is_next_section.match(e.line):
 259                 in_activities = False
 260             if in_activities:
 261                 m = is_activity.match(e.line)
 262                 if m:
 263                     e.line = m.groups()[0]
 264                     ab = i
 265                 if is_activity_time.match(e.line):
 266                     ae = i + 1
 267                     activities.append((ab, ae))
 268         i += 1
 269         if i >= tl: break
 270
 271     activities.reverse()
 272     for ab, ae in activities:
 273         act_len = ae - ab
 274         info_start = ae
 275
 276         act_els = []
 277         act_els.append(Container("opis", content[ab]))
 278         for i in range(ab, ae):
 279             e = content[i]
 280             if isinstance(e, Paragraph):
 281                 for prop, pattern in activity_props.items():
 282                     m = pattern.match(e.line)
 283                     if m:
 284                         act_els.append(Container(prop, m.groups()[0]))
 285                         if info_start > i: info_start = i
 286         act_els.insert(1, Container('wskazowki',
 287                                     *content[ab + 1:info_start]))
 288         content[ab:ae] = [Container('aktywnosc', *act_els)]
 289     return content
 290
 291
 292 def mark_dictionary(content):
 293     db = -1
 294     de = -1
 295     i = 0
 296     is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
 297     is_dictentry = re.compile(r"([^-]+) - (.+)")
 298     slowniczek = []
 299     while i < len(content):
 300         e = content[i]
 301         if isinstance(e, Section):
 302             if is_dictionary.match(e.title):
 303                 db = i + 1
 304             elif db >= 1:
 305                 de = i
 306                 content[db:de] = [Container('slowniczek', *slowniczek)]
 307         elif db >= 0:
 308             if isinstance(e, Paragraph):
 309                 m = is_dictentry.match(e.line)
 310                 if m:
 311                     slowniczek.append(Container('definiendum', m.groups()[0]))
 312                     slowniczek.append(Container('definiens', m.groups()[1]))
 313                 else:
 314                     slowniczek.append(e)
 315         i += 1
 316
 317     return content
 318
 319
 320 def toxml(content):
 321     content = mark_activities(content)
 322     content = mark_dictionary(content)
 323     info = content.pop(0)
 324
 325     state = info.state
 326     meta = state['meta']
 327     slug = slughifi(meta.get(u'Tytuł modułu', ''))
 328     holder = {}
 329     holder['xml'] = u""
 330
 331     def p(t):
 332         holder['xml'] += u"%s\n" % t
 333
 334     def dc(k, v):
 335         p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
 336
 337     def t(tag, ct):
 338         p(u'<%s>%s</%s>' % (tag, ct, tag))
 339
 340     def a(ct):
 341         if ct:
 342             t(u'akap', ct)
 343
 344     p("<utwor>")
 345     p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
 346     p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
 347     authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
 348     for author in authors:
 349         names = author.split(u' ')
 350         lastname = names.pop()
 351         names.insert(0, lastname + ",")
 352         author = u' '.join(names)
 353         dc(u'creator', author)
 354     dc(u'title', meta.get(u'Tytuł modułu', u''))
 355     dc(u'relation.isPartOf', meta.get(u'Dział', u''))
 356     dc(u'publisher', u'Fundacja Nowoczesna Polska')
 357     dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
 358     dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
 359     for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
 360         keyword = keyword.strip()
 361         dc(u'subject', keyword)
 362     dc(u'description', dc_fixed['description'])
 363     dc(u'description.material', dc_fixed['description.material'])
 364     dc(u'relation', dc_fixed['relation'])
 365     dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
 366     dc(u'rights', dc_fixed['rights'])
 367     dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
 368     dc(u'format', u'xml')
 369     dc(u'type', u'text')
 370     dc(u'date', u'2012-11-09')  # TODO
 371     dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
 372     dc(u'language', u'pol')
 373     p(u'</rdf:Description>')
 374     p(u'</rdf:RDF>')
 375
 376     p(u'<powiesc>')
 377     t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
 378     p(u'<nota>')
 379     a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
 380     p(u'</nota>')
 381
 382     p(unicode(info.title))
 383     for elm in content:
 384         if isinstance(elm, unicode) or isinstance(elm, str):
 385             a(elm)
 386             continue
 387         p(unicode(elm))
 388
 389     p(u'</powiesc>')
 390     p(u'</utwor>')
 391
 392     return holder['xml']
 393
 394
# TODO / TBD
# handle subsection headings:
#  "Przebieg zajęć"
#  "opcje dodatkowe"
# remove the 'opis zawartości' line
# paragraphs joined from several lines?