apps/catalogue/management/edumed.py

   1 # EduMed auto-tagger
   2 # -*- coding: utf-8 -*-
   3 import re
   4 from slughifi import slughifi
   5
   6
   7 class Tagger:
   8     def __init__(self, state, lines):
   9         self.state = state
  10         self.lines = lines
  11
  12     def spawn(self, cls):
  13         return cls(self.state, self.lines)
  14
  15     def line(self, position):
  16         return self.lines[position]
  17
  18     ignore = [ re.compile(r"^[\[][PA][\]] - [^ ]+$") ]
  19     empty_line = re.compile(r"^\s+$")
  20
  21     def skip_empty(self, position):
  22         while self.line(position) == "" or \
  23             self.empty_line.match(self.line(position)) or \
  24             filter(lambda r: r.match(self.line(position)),
  25                              self.ignore[:]):
  26             position += 1
  27         return position
  28
  29
  30     def tag(self, position):
  31         """
  32 Return None -- means that we can't tag it in any way
  33         """
  34         return None
  35
  36     def wrap(self, tagname, content):
  37         return u"<%s>%s</%s>" % (tagname, content, tagname)
  38
  39
  40 class Section(Tagger):
  41     looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
  42
  43     def tag(self, pos):
  44         pos2 = self.skip_empty(pos)
  45         pos = pos2
  46         m = self.looks_like.match(self.line(pos))
  47         if m:
  48             self.title = m.groups()[0]
  49             return pos + 1
  50
  51     def __unicode__(self):
  52         return self.wrap("naglowek_rozdzial", self.title)
  53
  54
  55 class Meta(Tagger):
  56     looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
  57
  58     def tag(self, pos):
  59         pos = self.skip_empty(pos)
  60         m = self.looks_like.match(self.line(pos))
  61         if m:
  62             k = m.groups()[0]
  63             v = m.groups()[1]
  64             m = self.state.get('meta', {})
  65             m[k] = v
  66             self.state['meta'] = m
  67             return pos + 1
  68
  69
  70 class Informacje(Tagger):
  71     def tag(self, pos):
  72         self.title = self.spawn(Section)
  73         self.meta = []
  74         pos = self.title.tag(pos)
  75         if pos is None: return
  76
  77             # collect meta
  78         while True:
  79             pos = self.skip_empty(pos)
  80             meta = self.spawn(Meta)
  81             pos2 = meta.tag(pos)
  82             if pos2 is None: break
  83             self.meta.append(meta)
  84             pos = pos2
  85
  86         return pos
  87
  88
  89 class List(Tagger):
  90     def tag(self, pos):
  91         self.items = []
  92         while True:
  93             l = self.line(pos)
  94             if l and l[0] in ('-', '*'):
  95                 self.items.append(l[1:].strip())
  96                 pos += 1
  97             else:
  98                 break
  99         if self.items:
 100             return pos
 101
 102     def __unicode__(self):
 103         s = "<lista>\n"
 104         for i in self.items:
 105             s += "<punkt>%s</punkt>\n" % i
 106         s += "</lista>\n"
 107         return s
 108
 109
 110 class Paragraph(Tagger):
 111     remove_this = [
 112         re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
 113         re.compile(r"^[\s]*$")
 114         ]
 115     podrozdzial = [
 116         re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
 117         ]
 118     def tag(self, pos):
 119         self.line = self.lines[pos]
 120         self.ignore = False
 121         self.is_podrozdzial = False
 122
 123         for x in self.remove_this:
 124             if x.match(self.line):
 125                 self.ignore = True
 126
 127         for x in self.podrozdzial:
 128             if x.match(self.line):
 129                 self.is_podrozdzial = True
 130
 131         return pos + 1
 132
 133     def __unicode__(self):
 134         if not self.ignore:
 135             if self.is_podrozdzial:
 136                 tag = 'naglowek_podrozdzial'
 137             else:
 138                 tag = 'akap'
 139             return u"<%s>%s</%s>" % (tag, self.line, tag)
 140         else:
 141             return u''
 142
 143
 144 class Container:
 145     def __init__(self, tag_name, *elems):
 146         self.tag_name = tag_name
 147         self.elems = elems
 148
 149     def __unicode__(self):
 150         s = u"<%s>" % self.tag_name
 151         add_nl = False
 152         for e in self.elems:
 153             if isinstance(e, (str, unicode)):
 154                 s += unicode(e)
 155             else:
 156                 s += "\n  " + unicode(e)
 157                 add_nl = True
 158
 159         if add_nl: s += "\n"
 160         s += u"</%s>" % self.tag_name
 161         return s
 162
 163
 164 def eatany(pos, *taggers):
 165     try:
 166         for t in list(taggers):
 167             p = t.tag(pos)
 168             if p:
 169                 return (t, p)
 170     except IndexError:
 171         pass
 172     return (None, pos)
 173
 174
 175 def eatseq(pos, *taggers):
 176     good = []
 177     taggers = list(taggers[:])
 178     try:
 179         while len(taggers):
 180             p = taggers[0].tag(pos)
 181             if p is None:
 182                 return (tuple(good), pos)
 183             good.append(taggers.pop(0))
 184             # print "%d -> %d" % (pos, p)
 185             pos = p
 186
 187     except IndexError:
 188         print "Got index error for pos=%d" % pos
 189     return (tuple(good), pos)
 190
 191
 192 def tagger(text):
 193     """
 194 tagger(text) function name and signature is a contract.
 195 returns auto-tagged text
 196     """
 197     if not isinstance(text, unicode):
 198         text = unicode(text.decode('utf-8'))
 199     lines = text.split("\n")
 200     pos = 0
 201     content = []
 202     state = {}
 203     info = Informacje(state, lines)
 204
 205     ((info,), pos) = eatseq(pos, info)
 206
 207     # print "[i] %d. %s" % (pos, lines[pos])
 208
 209     content.append(info)
 210
 211     while True:
 212         x, pos = eatany(pos, info.spawn(Section),
 213                         info.spawn(List), info.spawn(Paragraph))
 214
 215         if x is not None:
 216             content.append(x)
 217         else:
 218             content.append(lines[pos])
 219             pos += 1
 220             if pos >= len(lines):
 221                 break
 222
 223     return toxml(content)
 224
# Constant metadata values that toxml() writes verbatim into the
# <dc:description>, <dc:relation>, <dc:description.material> and <dc:rights>
# elements of every generated document.
# NOTE(review): the 'relation' and 'description.material' texts look like
# placeholders rather than final values -- confirm with the content owners.
dc_fixed = {
    'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
    'relation':  u'moduły powiązane linki',
    'description.material': u'linki do załączników',
    'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
    }
 231
 232
 233 def mark_activities(content):
 234     i = 0
 235     tl = len(content)
 236     is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
 237     #    import pdb; pdb.set_trace()
 238     is_next_section = re.compile(r"^[IVX]+[.]? ")
 239     is_activity = re.compile(r"^[0-9]+[.]? (.+)")
 240
 241     is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
 242     is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
 243     is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
 244     activity_props = {
 245         'pomoce': is_activity_tools,
 246         'forma': is_activity_work,
 247         'czas': is_activity_time
 248         }
 249     activities = []
 250
 251     in_activities = False
 252     ab = -1
 253     ae = -1
 254     while True:
 255         e = content[i]
 256         if isinstance(e, Paragraph):
 257             if not in_activities and \
 258                 is_przebieg.match(e.line):
 259                 in_activities = True
 260
 261             if in_activities and \
 262                 is_next_section.match(e.line):
 263                 in_activities = False
 264             if in_activities:
 265                 m = is_activity.match(e.line)
 266                 if m:
 267                     e.line = m.groups()[0]
 268                     ab = i
 269                 if is_activity_time.match(e.line):
 270                     ae = i + 1
 271                     activities.append((ab, ae))
 272         i += 1
 273         if i >= tl: break
 274
 275     activities.reverse()
 276     for ab, ae in activities:
 277         act_len = ae - ab
 278         info_start = ae
 279
 280         act_els = []
 281         act_els.append(Container("opis", content[ab]))
 282         for i in range(ab, ae):
 283             e = content[i]
 284             if isinstance(e, Paragraph):
 285                 for prop, pattern in activity_props.items():
 286                     m = pattern.match(e.line)
 287                     if m:
 288                         act_els.append(Container(prop, m.groups()[0]))
 289                         if info_start > i: info_start = i
 290         act_els.insert(1, Container('wskazowki',
 291                                     *content[ab + 1:info_start]))
 292         content[ab:ae] = [Container('aktywnosc', *act_els)]
 293     return content
 294
 295
 296 def mark_dictionary(content):
 297     db = -1
 298     de = -1
 299     i = 0
 300     is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
 301     is_dictentry = re.compile(r"([^-]+) - (.+)")
 302     slowniczek = []
 303     while i < len(content):
 304         e = content[i]
 305         if isinstance(e, Section):
 306             if is_dictionary.match(e.title):
 307                 db = i + 1
 308             elif db >= 1:
 309                 de = i
 310                 content[db:de] = [Container('slowniczek', *slowniczek)]
 311         elif db >= 0:
 312             if isinstance(e, Paragraph):
 313                 m = is_dictentry.match(e.line)
 314                 if m:
 315                     slowniczek.append(Container('definiendum', m.groups()[0]))
 316                     slowniczek.append(Container('definiens', m.groups()[1]))
 317                 else:
 318                     slowniczek.append(e)
 319         i += 1
 320
 321     return content
 322
 323
 324 def toxml(content):
 325     content = mark_activities(content)
 326     content = mark_dictionary(content)
 327     info = content.pop(0)
 328
 329     state = info.state
 330     meta = state['meta']
 331     slug = slughifi(meta.get(u'Tytuł modułu', ''))
 332     holder = {}
 333     holder['xml'] = u""
 334
 335     def p(t):
 336         holder['xml'] += u"%s\n" % t
 337
 338     def dc(k, v):
 339         p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
 340
 341     def t(tag, ct):
 342         p(u'<%s>%s</%s>' % (tag, ct, tag))
 343
 344     def a(ct):
 345         if ct:
 346             t(u'akap', ct)
 347
 348     p("<utwor>")
 349     p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
 350     p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
 351     authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
 352     for author in authors:
 353         names = author.split(u' ')
 354         lastname = names.pop()
 355         names.insert(0, lastname + ",")
 356         author = u' '.join(names)
 357         dc(u'creator', author)
 358     dc(u'title', meta.get(u'Tytuł modułu', u''))
 359     dc(u'relation.isPartOf', meta.get(u'Dział', u''))
 360     dc(u'publisher', u'Fundacja Nowoczesna Polska')
 361     dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
 362     dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
 363     for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
 364         keyword = keyword.strip()
 365         dc(u'subject', keyword)
 366     dc(u'description', dc_fixed['description'])
 367     dc(u'description.material', dc_fixed['description.material'])
 368     dc(u'relation', dc_fixed['relation'])
 369     dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
 370     dc(u'rights', dc_fixed['rights'])
 371     dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
 372     dc(u'format', u'xml')
 373     dc(u'type', u'text')
 374     dc(u'date', u'2012-11-09')  # TODO
 375     dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
 376     dc(u'language', u'pol')
 377     p(u'</rdf:Description>')
 378     p(u'</rdf:RDF>')
 379
 380     p(u'<powiesc>')
 381     t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
 382     p(u'<nota>')
 383     a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
 384     p(u'</nota>')
 385
 386     p(unicode(info.title))
 387     for elm in content:
 388         if isinstance(elm, unicode) or isinstance(elm, str):
 389             a(elm)
 390             continue
 391         p(unicode(elm))
 392
 393     p(u'</powiesc>')
 394     p(u'</utwor>')
 395
 396     return holder['xml']
 397
 398
 399 # TODO / TBD
 400 # ogarnąć podrozdziały
 401 #  Przebieg zajęć
 402 #  opcje dodatkowe
 403 # usunąć 'opis zawartości'
 404 # akapit łączony?