apps/catalogue/management/edumed.py

   1 # EduMed auto-tagger
   2 # -*- coding: utf-8 -*-
   3 import re
   4 from slughifi import slughifi
   5
   6
   7 class Tagger:
   8     def __init__(self, state, lines):
   9         self.state = state
  10         self.lines = lines
  11
  12     def spawn(self, cls):
  13         return cls(self.state, self.lines)
  14
  15     def line(self, position):
  16         return self.lines[position]
  17
  18     ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
  19     empty_line = re.compile(r"^\s+$")
  20
  21     def skip_empty(self, position):
  22         while self.line(position) == "" or \
  23             self.empty_line.match(self.line(position)) or \
  24             filter(lambda r: r.match(self.line(position)),
  25                              self.ignore[:]):
  26             position += 1
  27         return position
  28
  29     def tag(self, position):
  30         """
  31 Return None -- means that we can't tag it in any way
  32         """
  33         return None
  34
  35     def wrap(self, tagname, content):
  36         return u"<%s>%s</%s>" % (tagname, content, tagname)
  37
  38     @staticmethod
  39     def anymatches(regex):
  40         return lambda x: regex.match(x)
  41
  42
  43
  44 class Section(Tagger):
  45     looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
  46
  47     def tag(self, pos):
  48         pos2 = self.skip_empty(pos)
  49         pos = pos2
  50         m = self.looks_like.match(self.line(pos))
  51         if m:
  52             self.title = m.groups()[0]
  53             return pos + 1
  54
  55     def __unicode__(self):
  56         return self.wrap("naglowek_rozdzial", self.title)
  57
  58
  59 class Meta(Tagger):
  60     looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
  61
  62     def tag(self, pos):
  63         pos = self.skip_empty(pos)
  64         m = self.looks_like.match(self.line(pos))
  65         if m:
  66             k = m.groups()[0]
  67             v = m.groups()[1]
  68             m = self.state.get('meta', {})
  69             m[k] = v
  70             self.state['meta'] = m
  71             return pos + 1
  72
  73
  74 class Informacje(Tagger):
  75     def tag(self, pos):
  76         self.title = self.spawn(Section)
  77         self.meta = []
  78         pos = self.title.tag(pos)
  79         if pos is None: return
  80
  81             # collect meta
  82         while True:
  83             pos = self.skip_empty(pos)
  84             meta = self.spawn(Meta)
  85             pos2 = meta.tag(pos)
  86             if pos2 is None: break
  87             self.meta.append(meta)
  88             pos = pos2
  89
  90         return pos
  91
  92
  93 class List(Tagger):
  94     point = re.compile(r"^[\s]*([-*])")
  95
  96     def tag(self, pos):
  97         self.items = []
  98         while True:
  99             l = self.line(pos)
 100             m = self.point.match(l)
 101             if l and m:
 102                 self.items.append(l[1:].strip())
 103                 pos += 1
 104             else:
 105                 break
 106         if self.items:
 107             return pos
 108
 109     def __unicode__(self):
 110         s = '<lista typ="punkt">'
 111         for i in self.items:
 112             s += "\n<punkt>%s</punkt>" % i
 113         s += "\n</lista>\n"
 114         return s
 115
 116
 117 class Paragraph(Tagger):
 118     remove_this = [
 119         re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
 120         re.compile(r"^[\s]*$")
 121         ]
 122     podrozdzial = [
 123         re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
 124         ]
 125     def tag(self, pos):
 126         self.line = self.lines[pos]
 127         self.ignore = False
 128         self.is_podrozdzial = False
 129
 130         for x in self.remove_this:
 131             if x.match(self.line):
 132                 self.ignore = True
 133
 134         for x in self.podrozdzial:
 135             if x.match(self.line):
 136                 self.is_podrozdzial = True
 137
 138         return pos + 1
 139
 140     def __unicode__(self):
 141         if not self.ignore:
 142             if self.is_podrozdzial:
 143                 tag = 'naglowek_podrozdzial'
 144             else:
 145                 tag = 'akap'
 146             return u"<%s>%s</%s>" % (tag, self.line, tag)
 147         else:
 148             return u''
 149
 150
 151 class Container:
 152     def __init__(self, tag_name, *elems):
 153         self.tag_name = tag_name
 154         self.elems = elems
 155
 156     def __unicode__(self):
 157         s = u"<%s>" % self.tag_name
 158         add_nl = False
 159         for e in self.elems:
 160             if isinstance(e, (str, unicode)):
 161                 s += unicode(e)
 162             else:
 163                 s += "\n  " + unicode(e)
 164                 add_nl = True
 165
 166         if add_nl: s += "\n"
 167         s += u"</%s>" % self.tag_name
 168         return s
 169
 170
 171 def eatany(pos, *taggers):
 172     try:
 173         for t in list(taggers):
 174             p = t.tag(pos)
 175             if p:
 176                 return (t, p)
 177     except IndexError:
 178         pass
 179     return (None, pos)
 180
 181
 182 def eatseq(pos, *taggers):
 183     good = []
 184     taggers = list(taggers[:])
 185     try:
 186         while len(taggers):
 187             p = taggers[0].tag(pos)
 188             if p is None:
 189                 return (tuple(good), pos)
 190             good.append(taggers.pop(0))
 191             # print "%d -> %d" % (pos, p)
 192             pos = p
 193
 194     except IndexError:
 195         print "Got index error for pos=%d" % pos
 196     return (tuple(good), pos)
 197
 198
 199 def tagger(text):
 200     """
 201 tagger(text) function name and signature is a contract.
 202 returns auto-tagged text
 203     """
 204     if not isinstance(text, unicode):
 205         text = unicode(text.decode('utf-8'))
 206     lines = text.split("\n")
 207     pos = 0
 208     content = []
 209     state = {}
 210     info = Informacje(state, lines)
 211
 212     ((info,), pos) = eatseq(pos, info)
 213
 214     # print "[i] %d. %s" % (pos, lines[pos])
 215
 216     content.append(info)
 217
 218     while True:
 219         x, pos = eatany(pos, info.spawn(Section),
 220                         info.spawn(List), info.spawn(Paragraph))
 221
 222         if x is not None:
 223             content.append(x)
 224         else:
 225             content.append(lines[pos])
 226             pos += 1
 227             if pos >= len(lines):
 228                 break
 229
 230     return toxml(content)
 231
 232 dc_fixed = {
 233     'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
 234     'relation':  u'moduły powiązane linki',
 235     'description.material': u'linki do załączników',
 236     'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
 237     }
 238
 239
 240 def mark_activities(content):
 241     i = 0
 242     tl = len(content)
 243     is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
 244     #    import pdb; pdb.set_trace()
 245     is_next_section = re.compile(r"^[IVX]+[.]? ")
 246     is_activity = re.compile(r"^[0-9]+[.]? (.+)")
 247
 248     is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
 249     is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
 250     is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
 251     activity_props = {
 252         'pomoce': is_activity_tools,
 253         'forma': is_activity_work,
 254         'czas': is_activity_time
 255         }
 256     activities = []
 257
 258     in_activities = False
 259     ab = -1
 260     ae = -1
 261     while True:
 262         e = content[i]
 263         if isinstance(e, Paragraph):
 264             if not in_activities and \
 265                 is_przebieg.match(e.line):
 266                 in_activities = True
 267
 268             if in_activities and \
 269                 is_next_section.match(e.line):
 270                 in_activities = False
 271             if in_activities:
 272                 m = is_activity.match(e.line)
 273                 if m:
 274                     e.line = m.groups()[0]
 275                     ab = i
 276                 if is_activity_time.match(e.line):
 277                     ae = i + 1
 278                     activities.append((ab, ae))
 279         i += 1
 280         if i >= tl: break
 281
 282     activities.reverse()
 283     for ab, ae in activities:
 284         act_len = ae - ab
 285         info_start = ae
 286
 287         act_els = []
 288         act_els.append(Container("opis", content[ab]))
 289         for i in range(ab, ae):
 290             e = content[i]
 291             if isinstance(e, Paragraph):
 292                 for prop, pattern in activity_props.items():
 293                     m = pattern.match(e.line)
 294                     if m:
 295                         act_els.append(Container(prop, m.groups()[0]))
 296                         if info_start > i: info_start = i
 297         act_els.insert(1, Container('wskazowki',
 298                                     *content[ab + 1:info_start]))
 299         content[ab:ae] = [Container('aktywnosc', *act_els)]
 300     return content
 301
 302
 303 def mark_dictionary(content):
 304     db = -1
 305     de = -1
 306     i = 0
 307     is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
 308     is_dictentry = re.compile(r"([^-]+) - (.+)")
 309     slowniczek = []
 310     while i < len(content):
 311         e = content[i]
 312         if isinstance(e, Section):
 313             if is_dictionary.match(e.title):
 314                 db = i + 1
 315             elif db >= 1:
 316                 de = i
 317                 content[db:de] = [Container('slowniczek', *slowniczek)]
 318                 break
 319         elif db >= 0:
 320             if isinstance(e, Paragraph):
 321                 m = is_dictentry.match(e.line)
 322                 if m:
 323                     slowniczek.append(Container('definiendum', m.groups()[0]))
 324                     slowniczek.append(Container('definiens', m.groups()[1]))
 325                 else:
 326                     slowniczek.append(e)
 327         i += 1
 328
 329     return content
 330
 331
 332 def toxml(content):
 333     content = mark_activities(content)
 334     content = mark_dictionary(content)
 335     info = content.pop(0)
 336
 337     state = info.state
 338     meta = state['meta']
 339     slug = slughifi(meta.get(u'Tytuł modułu', ''))
 340     holder = {}
 341     holder['xml'] = u""
 342
 343     def p(t):
 344         holder['xml'] += u"%s\n" % t
 345
 346     def dc(k, v):
 347         p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
 348
 349     def t(tag, ct):
 350         p(u'<%s>%s</%s>' % (tag, ct, tag))
 351
 352     def a(ct):
 353         if ct:
 354             t(u'akap', ct)
 355
 356     p("<utwor>")
 357     p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
 358     p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
 359     authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
 360     for author in authors:
 361         names = author.split(u' ')
 362         lastname = names.pop()
 363         names.insert(0, lastname + ",")
 364         author = u' '.join(names)
 365         dc(u'creator', author)
 366     dc(u'title', meta.get(u'Tytuł modułu', u''))
 367     dc(u'relation.isPartOf', meta.get(u'Dział', u''))
 368     dc(u'publisher', u'Fundacja Nowoczesna Polska')
 369     dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
 370     dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
 371     for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
 372         keyword = keyword.strip()
 373         dc(u'subject', keyword)
 374     dc(u'description', dc_fixed['description'])
 375     dc(u'description.material', dc_fixed['description.material'])
 376     dc(u'relation', dc_fixed['relation'])
 377     dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
 378     dc(u'rights', dc_fixed['rights'])
 379     dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
 380     dc(u'format', u'xml')
 381     dc(u'type', u'text')
 382     dc(u'date', u'2012-11-09')  # TODO
 383     dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
 384     dc(u'language', u'pol')
 385     p(u'</rdf:Description>')
 386     p(u'</rdf:RDF>')
 387
 388     p(u'<powiesc>')
 389     t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
 390     p(u'<nota>')
 391     a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
 392     p(u'</nota>')
 393
 394     p(unicode(info.title))
 395     for elm in content:
 396         if isinstance(elm, unicode) or isinstance(elm, str):
 397             a(elm)
 398             continue
 399         p(unicode(elm))
 400
 401     p(u'</powiesc>')
 402     p(u'</utwor>')
 403
 404     return holder['xml']
 405
 406
# TODO / TBD
# handle sub-section headers:
#  "Przebieg zajęć"
#  "opcje dodatkowe"
# remove the 'opis zawartości' line
# joined (multi-line) paragraphs?