apps/catalogue/management/edumed.py

   1 # EduMed auto-tagger
   2 # -*- coding: utf-8 -*-
   3 import re
   4 from slughifi import slughifi
   5
   6
   7 class Tagger(object):
   8     def __init__(self, state, lines):
   9         self.state = state
  10         self.lines = lines
  11
  12     def spawn(self, cls):
  13         return cls(self.state, self.lines)
  14
  15     def line(self, position):
  16         return self.lines[position]
  17
  18     ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
  19     empty_line = re.compile(r"^\s+$")
  20
  21     def skip_empty(self, position):
  22         while self.line(position) == "" or \
  23             self.empty_line.match(self.line(position)) or \
  24             filter(lambda r: r.match(self.line(position)),
  25                              self.ignore[:]):
  26             position += 1
  27         return position
  28
  29     def tag(self, position):
  30         """
  31 Return None -- means that we can't tag it in any way
  32         """
  33         return None
  34
  35     def wrap(self, tagname, content):
  36         return u"<%s>%s</%s>" % (tagname, content, tagname)
  37
  38     @staticmethod
  39     def anymatches(regex):
  40         return lambda x: regex.match(x)
  41
  42
  43 class Section(Tagger):
  44     looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
  45
  46     def __init__(self, *a):
  47         super(Section, self).__init__(*a)
  48         self.is_podrozdzial = False
  49
  50     def tag(self, pos):
  51         pos2 = self.skip_empty(pos)
  52         pos = pos2
  53         m = self.looks_like.match(self.line(pos))
  54         if m:
  55             self.title = m.groups()[0]
  56             return pos + 1
  57
  58     def __unicode__(self):
  59         return self.wrap(self.is_podrozdzial and "naglowek_podrozdzial" or "naglowek_rozdzial",
  60                          self.title)
  61
  62
  63 class Meta(Tagger):
  64     looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
  65
  66     def tag(self, pos):
  67         pos = self.skip_empty(pos)
  68         m = self.looks_like.match(self.line(pos))
  69         if m:
  70             k = m.groups()[0]
  71             v = m.groups()[1]
  72             m = self.state.get('meta', {})
  73             m[k] = v
  74             self.state['meta'] = m
  75             return pos + 1
  76
  77
  78 class Informacje(Tagger):
  79     def tag(self, pos):
  80         self.title = self.spawn(Section)
  81         self.meta = []
  82         pos = self.title.tag(pos)
  83         if pos is None: return
  84
  85             # collect meta
  86         while True:
  87             pos = self.skip_empty(pos)
  88             meta = self.spawn(Meta)
  89             pos2 = meta.tag(pos)
  90             if pos2 is None: break
  91             self.meta.append(meta)
  92             pos = pos2
  93
  94         return pos
  95
  96
  97 class List(Tagger):
  98     point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
  99     num = re.compile(r"^[\s]*[a-z][.]\s+(.*)")
 100
 101     def __init__(self, *args):
 102
 103         super(List, self).__init__(*args)
 104         self.items = []
 105         self.type = 'punkt'
 106
 107     def tag(self, pos):
 108         while True:
 109             l = self.line(pos)
 110             m = self.point.match(l)
 111             if not m:
 112                 m = self.num.match(l)
 113                 if m: self.type = 'num'
 114             if l and m:
 115                 self.items.append(m.groups()[0].lstrip())
 116                 pos += 1
 117             else:
 118                 break
 119         if self.items:
 120             return pos
 121
 122     def append(self, tagger):
 123         self.items.append(tagger)
 124
 125     def __unicode__(self):
 126         s = '<lista typ="%s">' % self.type
 127         for i in self.items:
 128             if isinstance(i, list):
 129                 x = "\n".join(map(lambda elem: unicode(elem), i))
 130             else:
 131                 x = unicode(i)
 132             s += "\n<punkt>%s</punkt>" % x
 133         s += "\n</lista>\n"
 134         return s
 135
 136
 137 class Paragraph(Tagger):
 138     remove_this = [
 139         re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
 140         re.compile(r"^[\s]*$"),
 141         re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek")
 142         ]
 143     podrozdzial = [
 144         re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
 145         ]
 146
 147     def tag(self, pos):
 148         self.line = self.lines[pos]
 149         self.ignore = False
 150         self.is_podrozdzial = False
 151
 152         for x in self.remove_this:
 153             if x.match(self.line):
 154                 self.ignore = True
 155
 156         for x in self.podrozdzial:
 157             if x.match(self.line):
 158                 self.is_podrozdzial = True
 159
 160         return pos + 1
 161
 162     def __unicode__(self):
 163         if not self.ignore:
 164             if self.is_podrozdzial:
 165                 tag = 'naglowek_podrozdzial'
 166             else:
 167                 tag = 'akap'
 168             return u"<%s>%s</%s>" % (tag, self.line, tag)
 169         else:
 170             return u''
 171
 172
 173 class Container:
 174     def __init__(self, tag_name, *elems):
 175         self.tag_name = tag_name
 176         self.elems = elems
 177
 178     def __unicode__(self):
 179         s = u"<%s>" % self.tag_name
 180         add_nl = False
 181         for e in self.elems:
 182             if isinstance(e, (str, unicode)):
 183                 s += unicode(e)
 184             else:
 185                 s += "\n  " + unicode(e)
 186                 add_nl = True
 187
 188         if add_nl: s += "\n"
 189         s += u"</%s>" % self.tag_name
 190         return s
 191
 192
 193 def eatany(pos, *taggers):
 194     try:
 195         for t in list(taggers):
 196             p = t.tag(pos)
 197             if p:
 198                 return (t, p)
 199     except IndexError:
 200         pass
 201     return (None, pos)
 202
 203
 204 def eatseq(pos, *taggers):
 205     good = []
 206     taggers = list(taggers[:])
 207     try:
 208         while len(taggers):
 209             p = taggers[0].tag(pos)
 210             if p is None:
 211                 return (tuple(good), pos)
 212             good.append(taggers.pop(0))
 213             # print "%d -> %d" % (pos, p)
 214             pos = p
 215
 216     except IndexError:
 217         print "Got index error for pos=%d" % pos
 218     return (tuple(good), pos)
 219
 220
 221 def tagger(text, pretty_print=False):
 222     """
 223 tagger(text) function name and signature is a contract.
 224 returns auto-tagged text
 225     """
 226     if not isinstance(text, unicode):
 227         text = unicode(text.decode('utf-8'))
 228     lines = text.split("\n")
 229     pos = 0
 230     content = []
 231     state = {}
 232     info = Informacje(state, lines)
 233
 234     ((info,), pos) = eatseq(pos, info)
 235
 236     # print "[i] %d. %s" % (pos, lines[pos])
 237
 238     content.append(info)
 239
 240     while True:
 241         x, pos = eatany(pos, info.spawn(Section),
 242                         info.spawn(List), info.spawn(Paragraph))
 243
 244         if x is not None:
 245             content.append(x)
 246         else:
 247             content.append(lines[pos])
 248             pos += 1
 249             if pos >= len(lines):
 250                 break
 251
 252     return toxml(content, pretty_print=pretty_print)
 253
# Constant Dublin Core values that toxml() writes into every document.
# NOTE(review): 'relation' and 'description.material' read like placeholder
# text rather than real values -- confirm before relying on them.
dc_fixed = {
    'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://edukacjamedialna.edu.pl).',
    'relation': u'moduły powiązane linki',
    'description.material': u'linki do załączników',
    'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
    }
 260
 261
 262 class NotFound(Exception):
 263     pass
 264
 265
 266 def find_block(content, title_re, begin=-1, end=-1):
 267     title_re = re.compile(title_re, re.I | re.UNICODE)
 268
 269     rb = -1
 270     if begin < 0: begin = 0
 271     if end < 0: end = len(content)
 272
 273     for i in range(begin, end):
 274         elem = content[i]
 275         if isinstance(elem, Paragraph):
 276             if title_re.match(elem.line):
 277                 rb = i
 278                 continue
 279         if isinstance(elem, Section):
 280             if title_re.match(elem.title):
 281                 rb = i
 282                 continue
 283         if rb >= 0:
 284             if isinstance(elem, List):
 285                 continue
 286             if isinstance(elem, Paragraph) and elem.line:
 287                 continue
 288             break
 289     if rb >= 0:
 290         return rb, i
 291     raise NotFound()
 292
 293
 294 def remove_block(content, title_re, removed=None):
 295     rb, re = find_block(content, title_re)
 296     if removed is not None and isinstance(removed, list):
 297         removed += content[rb:re][:]
 298     content[rb:re] = []
 299     return content
 300
 301
 302 def mark_activities(content):
 303     i = 0
 304     tl = len(content)
 305     is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
 306
 307     is_next_section = re.compile(r"^[IVX]+[.]? ")
 308     is_activity = re.compile(r"^[0-9]+[.] (.+)")
 309
 310     is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
 311     is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
 312     is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
 313     activity_props = {
 314         'pomoce': is_activity_tools,
 315         'forma': is_activity_work,
 316         'czas': is_activity_time
 317         }
 318     activities = []
 319
 320     in_activities = False
 321     ab = -1
 322     ae = -1
 323     while True:
 324         e = content[i]
 325         if isinstance(e, Section):
 326             if in_activities and \
 327                 is_next_section.match(e.title):
 328                 in_activities = False
 329
 330         if isinstance(e, Paragraph):
 331             if not in_activities and \
 332                 is_przebieg.match(e.line):
 333                 in_activities = True
 334
 335             if in_activities:
 336                 m = is_activity.match(e.line)
 337                 if m:
 338                     e.line = m.groups()[0]
 339                     ab = i
 340                 if is_activity_time.match(e.line):
 341                     ae = i + 1
 342                     activities.append((ab, ae))
 343         i += 1
 344         if i >= tl: break
 345
 346     activities.reverse()
 347     for ab, ae in activities:
 348         act_len = ae - ab
 349         info_start = ae
 350
 351         act_els = []
 352         act_els.append(Container("opis", content[ab]))
 353         for i in range(ab, ae):
 354             e = content[i]
 355             if isinstance(e, Paragraph):
 356                 for prop, pattern in activity_props.items():
 357                     m = pattern.match(e.line)
 358                     if m:
 359                         act_els.append(Container(prop, m.groups()[0]))
 360                         if info_start > i: info_start = i
 361         act_els.insert(1, Container('wskazowki',
 362                                     *content[ab + 1:info_start]))
 363         content[ab:ae] = [Container('aktywnosc', *act_els)]
 364     return content
 365
 366
 367 def mark_dictionary(content):
 368     db = -1
 369     de = -1
 370     i = 0
 371     is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
 372     is_dictentry = re.compile(r"([^-]+) - (.+)")
 373     slowniczek = content[0].spawn(List)
 374     slowniczek.type = 'slowniczek'
 375     while i < len(content):
 376         e = content[i]
 377         if isinstance(e, Section):
 378             if is_dictionary.match(e.title):
 379                 db = i + 1
 380             elif db >= 1:
 381                 de = i
 382                 content[db:de] = [slowniczek]
 383                 break
 384         elif db >= 0:
 385             if isinstance(e, Paragraph):
 386                 m = is_dictentry.match(e.line)
 387                 if m:
 388                     slowniczek.append([Container('definiendum', m.groups()[0]),
 389                                        Container('definiens', m.groups()[1])])
 390
 391                 else:
 392                     slowniczek.append(e)
 393         i += 1
 394
 395     return content
 396
 397
 398 def mark_czytelnia(content):
 399     db = -1
 400     de = -1
 401     i = 0
 402     czy_czytelnia = re.compile(r"[\s]*czytelnia[\s]*", re.I)
 403     czytelnia = content[0].spawn(List)
 404     czytelnia.type = 'czytelnia'
 405     while i < len(content):
 406         e = content[i]
 407         if isinstance(e, Section):
 408             if czy_czytelnia.match(e.title):
 409                 db = i + 1
 410             elif db >= 1:
 411                 de = i
 412                 content[db:de] = [czytelnia]
 413                 break
 414         elif db >= 0:
 415             if isinstance(e, Paragraph):
 416                 if e.line:
 417                     czytelnia.append(e.line)
 418         i += 1
 419
 420     return content
 421
 422
 423
 424 def move_evaluation(content):
 425     evaluation = []
 426
 427     content = remove_block(content, r"ewaluacja[+ PA\[\].]*", evaluation)
 428     if evaluation:
 429         #        print "found evaluation %s" % (evaluation,)
 430         evaluation[0].is_podrozdzial = True
 431         # evaluation place
 432         opcje_dodatkowe = find_block(content, r"opcje dodatkowe\s*")
 433         if opcje_dodatkowe:
 434             #            print "putting evaluation just before opcje dodatkowe @ %s" % (opcje_dodatkowe, )
 435             content[opcje_dodatkowe[0]:opcje_dodatkowe[0]] = evaluation
 436         else:
 437             materialy = find_block(content, r"materia.y[+ AP\[\].]*")
 438             if materialy:
 439                 #                print "putting evaluation just before materialy @ %s" % (materialy, )
 440                 content[materialy[0]:materialy[0]] = evaluation
 441             else:
 442                 print "er.. no idea where to place evaluation"
 443     return content
 444
 445
 446 def toxml(content, pretty_print=False):
 447     # some transformations
 448     content = mark_activities(content)
 449     content = mark_dictionary(content)
 450     content = mark_czytelnia(content)
 451
 452     try:
 453         content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
 454     except NotFound:
 455         pass
 456     try:
 457         content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
 458     except NotFound:
 459         pass
 460     content = move_evaluation(content)
 461
 462     info = content.pop(0)
 463
 464     state = info.state
 465     meta = state['meta']
 466     slug = slughifi(meta.get(u'Tytuł modułu', ''))
 467     holder = {}
 468     holder['xml'] = u""
 469
 470     def p(t):
 471         holder['xml'] += u"%s\n" % t
 472
 473     def dc(k, v):
 474         p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
 475
 476     def t(tag, ct):
 477         p(u'<%s>%s</%s>' % (tag, ct, tag))
 478
 479     def a(ct):
 480         if ct:
 481             t(u'akap', ct)
 482
 483     p("<utwor>")
 484     p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
 485     p(u'<rdf:Description rdf:about="http://redakcja.edukacjamedialna.edu.pl/documents/">')
 486     authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
 487     for author in authors:
 488         names = author.split(u' ')
 489         lastname = names.pop()
 490         names.insert(0, lastname + ",")
 491         author = u' '.join(names)
 492         dc(u'creator', author)
 493     dc(u'title', meta.get(u'Tytuł modułu', u''))
 494     dc(u'relation.isPartOf', meta.get(u'Dział', u''))
 495     dc(u'publisher', u'Fundacja Nowoczesna Polska')
 496     dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
 497     dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
 498     for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
 499         keyword = keyword.strip()
 500         dc(u'subject', keyword)
 501     dc(u'description', dc_fixed['description'])
 502     dc(u'description.material', dc_fixed['description.material'])
 503     dc(u'relation', dc_fixed['relation'])
 504     dc(u'identifier.url', u'http://edukacjamedialna.edu.pl/%s' % slug)
 505     dc(u'rights', dc_fixed['rights'])
 506     dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
 507     dc(u'format', u'xml')
 508     dc(u'type', u'text')
 509     dc(u'date', u'2012-11-09')  # TODO
 510     dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
 511     dc(u'language', u'pol')
 512     p(u'</rdf:Description>')
 513     p(u'</rdf:RDF>')
 514
 515     p(u'<powiesc>')
 516     t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
 517     #    p(u'<nota>')
 518     a(u'<!-- Numer porządkowy: %s -->' % meta.get(u'Numer porządkowy', u''))
 519     #    p(u'</nota>')
 520
 521     p(unicode(info.title))
 522     for elm in content:
 523         if isinstance(elm, unicode) or isinstance(elm, str):
 524             a(elm)
 525             continue
 526         p(unicode(elm))
 527
 528     p(u'</powiesc>')
 529     p(u'</utwor>')
 530
 531     if pretty_print:
 532         from lxml import etree
 533         from StringIO import StringIO
 534         xml = etree.parse(StringIO(holder['xml']))
 535         holder['xml'] = etree.tostring(xml, pretty_print=pretty_print, encoding=unicode)
 536
 537     return holder['xml']
 538
 539
# TODO / TBD
# sort out sub-chapters:
#  "Przebieg zajęć" (lesson flow)
#  "opcje dodatkowe" (additional options)
# remove 'opis zawartości' (content description)
# merged paragraph?