2 # -*- coding: utf-8 -*-
4 from slughifi import slughifi
8 def __init__(self, state, lines):
13 return cls(self.state, self.lines)
def line(self, position):
    """Return the source line stored at index ``position``.

    ``self.lines`` is indexed directly, so an out-of-range ``position``
    raises ``IndexError`` and a negative one follows normal sequence
    indexing.
    """
    return self.lines[position]
18 ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
19 empty_line = re.compile(r"^\s+$")
21 def skip_empty(self, position):
22 while self.line(position) == "" or \
23 self.empty_line.match(self.line(position)) or \
24 filter(lambda r: r.match(self.line(position)),
29 def tag(self, position):
31 Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
    """Return ``content`` enclosed in an opening and closing ``tagname`` tag.

    ``content`` is interpolated with ``%s`` exactly as given; no XML
    escaping is applied here.
    """
    return u"<%s>%s</%s>" % (tagname, content, tagname)
def anymatches(regex):
    """Return a one-argument predicate that applies ``regex.match``.

    The predicate returns whatever ``regex.match`` returns for its
    argument (a match object or ``None`` for a compiled pattern).
    """
    # The bound method already is the predicate; no lambda wrapper needed.
    return regex.match
43 class Section(Tagger):
44 looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
def __init__(self, *a):
    """Initialise through the parent constructor; start as a main section.

    All positional arguments are forwarded unchanged to the parent class.
    """
    super(Section, self).__init__(*a)
    # Sub-section (podrozdzial) flag is off by default.
    self.is_podrozdzial = False
51 pos2 = self.skip_empty(pos)
53 m = self.looks_like.match(self.line(pos))
55 self.title = m.groups()[0]
58 def __unicode__(self):
59 return self.wrap(self.is_podrozdzial and "naglowek_podrozdzial" or "naglowek_rozdzial",
64 looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
67 pos = self.skip_empty(pos)
68 m = self.looks_like.match(self.line(pos))
72 m = self.state.get('meta', {})
74 self.state['meta'] = m
78 class Informacje(Tagger):
80 self.title = self.spawn(Section)
82 pos = self.title.tag(pos)
83 if pos is None: return
87 pos = self.skip_empty(pos)
88 meta = self.spawn(Meta)
90 if pos2 is None: break
91 self.meta.append(meta)
98 point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
99 num = re.compile(r"^[\s]*[a-z]{1,2}[.]\s+(.*)")
101 def __init__(self, *args):
103 super(List, self).__init__(*args)
110 m = self.point.match(l)
112 m = self.num.match(l)
113 if m: self.type = 'num'
115 self.items.append(m.groups()[0].lstrip())
def append(self, tagger):
    """Add ``tagger`` as the last entry of ``self.items``."""
    self.items.append(tagger)
125 def __unicode__(self):
126 s = '<lista typ="%s">' % self.type
128 if isinstance(i, list):
129 x = "\n".join(map(lambda elem: unicode(elem), i))
132 s += "\n<punkt>%s</punkt>" % x
137 class Paragraph(Tagger):
139 re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
140 re.compile(r"^[\s]*$")
143 re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
147 self.line = self.lines[pos]
149 self.is_podrozdzial = False
151 for x in self.remove_this:
152 if x.match(self.line):
155 for x in self.podrozdzial:
156 if x.match(self.line):
157 self.is_podrozdzial = True
161 def __unicode__(self):
163 if self.is_podrozdzial:
164 tag = 'naglowek_podrozdzial'
167 return u"<%s>%s</%s>" % (tag, self.line, tag)
173 def __init__(self, tag_name, *elems):
174 self.tag_name = tag_name
177 def __unicode__(self):
178 s = u"<%s>" % self.tag_name
181 if isinstance(e, (str, unicode)):
184 s += "\n " + unicode(e)
188 s += u"</%s>" % self.tag_name
192 def eatany(pos, *taggers):
194 for t in list(taggers):
203 def eatseq(pos, *taggers):
205 taggers = list(taggers[:])
208 p = taggers[0].tag(pos)
210 return (tuple(good), pos)
211 good.append(taggers.pop(0))
212 # print "%d -> %d" % (pos, p)
216 print "Got index error for pos=%d" % pos
217 return (tuple(good), pos)
220 def tagger(text, pretty_print=False):
222 tagger(text) function name and signature is a contract.
223 returns auto-tagged text
225 if not isinstance(text, unicode):
226 text = unicode(text.decode('utf-8'))
227 lines = text.split("\n")
231 info = Informacje(state, lines)
233 ((info,), pos) = eatseq(pos, info)
235 # print "[i] %d. %s" % (pos, lines[pos])
240 x, pos = eatany(pos, info.spawn(Section),
241 info.spawn(List), info.spawn(Paragraph))
246 content.append(lines[pos])
248 if pos >= len(lines):
251 return toxml(content, pretty_print=pretty_print)
254 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
255 'relation': u'moduły powiązane linki',
256 'description.material': u'linki do załączników',
257 'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
261 def find_block(content, title_re, begin=-1, end=-1):
# Scan content[begin:end] for elements whose text matches title_re.
# title_re is a pattern string; it is compiled below with re.I | re.UNICODE.
# Negative begin/end are replaced by 0 and len(content) respectively.
# NOTE(review): callers unpack the result as two values and index it with [0];
# presumably a (start, end) index pair -- confirm against the return statements.
262 title_re = re.compile(title_re, re.I | re.UNICODE)
263 ## print "looking for %s" % title_re.pattern
# NOTE(review): leftover debugging aid -- any pattern whose text begins with
# 'pomoce' drops into the interactive pdb debugger and blocks execution.
# This should be removed before the code runs unattended.
264 if title_re.pattern[0:6] == 'pomoce':
265 import pdb; pdb.set_trace()
# Normalise the sentinel defaults to the full range of content.
268 if begin < 0: begin = 0
269 if end < 0: end = len(content)
271 for i in range(begin, end):
# Paragraph elements are matched on their .line text.
273 if isinstance(elem, Paragraph):
274 if title_re.match(elem.line):
# Section elements are matched on their .title text.
277 if isinstance(elem, Section):
278 if title_re.match(elem.title):
282 if isinstance(elem, List):
284 if isinstance(elem, Paragraph) and elem.line:
291 def remove_block(content, title_re, removed=None):
# Locate a block with find_block(content, title_re) and, when `removed` is a
# list, extend that list in place with a copy of the located slice.
# NOTE(review): the local name `re` below shadows the `re` regex module for
# the whole body of this function; renaming it (e.g. `rend`) would be safer.
292 rb, re = find_block(content, title_re)
# Only collect the removed elements when the caller supplied a list for them.
294 if removed is not None and isinstance(removed, list):
295 removed += content[rb:re][:]
300 def mark_activities(content):
303 is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
305 is_next_section = re.compile(r"^[IVX]+[.]? ")
306 is_activity = re.compile(r"^[0-9]+[.]? (.+)")
308 is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
309 is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
310 is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
312 'pomoce': is_activity_tools,
313 'forma': is_activity_work,
314 'czas': is_activity_time
318 in_activities = False
323 if isinstance(e, Paragraph):
324 if not in_activities and \
325 is_przebieg.match(e.line):
328 if in_activities and \
329 is_next_section.match(e.line):
330 in_activities = False
332 m = is_activity.match(e.line)
334 e.line = m.groups()[0]
336 if is_activity_time.match(e.line):
338 activities.append((ab, ae))
343 for ab, ae in activities:
348 act_els.append(Container("opis", content[ab]))
349 for i in range(ab, ae):
351 if isinstance(e, Paragraph):
352 for prop, pattern in activity_props.items():
353 m = pattern.match(e.line)
355 act_els.append(Container(prop, m.groups()[0]))
356 if info_start > i: info_start = i
357 act_els.insert(1, Container('wskazowki',
358 *content[ab + 1:info_start]))
359 content[ab:ae] = [Container('aktywnosc', *act_els)]
363 def mark_dictionary(content):
367 is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
368 is_dictentry = re.compile(r"([^-]+) - (.+)")
369 slowniczek = content[0].spawn(List)
370 slowniczek.type = 'slowniczek'
371 while i < len(content):
373 if isinstance(e, Section):
374 if is_dictionary.match(e.title):
378 content[db:de] = [slowniczek]
381 if isinstance(e, Paragraph):
382 m = is_dictentry.match(e.line)
384 slowniczek.append([Container('definiendum', m.groups()[0]),
385 Container('definiens', m.groups()[1])])
394 def move_evaluation(content):
397 content = remove_block(content, r"ewaluacja[+ PA\[\].]*", evaluation)
399 # print "found evaluation %s" % (evaluation,)
400 evaluation[0].is_podrozdzial = True
402 opcje_dodatkowe = find_block(content, r"opcje dodatkowe\s*")
404 # print "putting evaluation just before opcje dodatkowe @ %s" % (opcje_dodatkowe, )
405 content[opcje_dodatkowe[0]:opcje_dodatkowe[0]] = evaluation
407 materialy = find_block(content, r"materia.y[+ AP\[\].]*")
409 # print "putting evaluation just before materialy @ %s" % (materialy, )
410 content[materialy[0]:materialy[0]] = evaluation
412 print "er.. no idea where to place evaluation"
416 def toxml(content, pretty_print=False):
417 # some transformations
418 content = mark_activities(content)
419 content = mark_dictionary(content)
420 content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
421 content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
422 content = move_evaluation(content)
424 info = content.pop(0)
428 slug = slughifi(meta.get(u'Tytuł modułu', ''))
433 holder['xml'] += u"%s\n" % t
436 p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
439 p(u'<%s>%s</%s>' % (tag, ct, tag))
446 p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
447 p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
448 authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
449 for author in authors:
450 names = author.split(u' ')
451 lastname = names.pop()
452 names.insert(0, lastname + ",")
453 author = u' '.join(names)
454 dc(u'creator', author)
455 dc(u'title', meta.get(u'Tytuł modułu', u''))
456 dc(u'relation.isPartOf', meta.get(u'Dział', u''))
457 dc(u'publisher', u'Fundacja Nowoczesna Polska')
458 dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
459 dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
460 for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
461 keyword = keyword.strip()
462 dc(u'subject', keyword)
463 dc(u'description', dc_fixed['description'])
464 dc(u'description.material', dc_fixed['description.material'])
465 dc(u'relation', dc_fixed['relation'])
466 dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
467 dc(u'rights', dc_fixed['rights'])
468 dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
469 dc(u'format', u'xml')
471 dc(u'date', u'2012-11-09') # TODO
472 dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
473 dc(u'language', u'pol')
474 p(u'</rdf:Description>')
478 t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
480 a(u'<!-- Numer porządkowy: %s -->' % meta.get(u'Numer porządkowy', u''))
483 p(unicode(info.title))
485 if isinstance(elm, unicode) or isinstance(elm, str):
494 from lxml import etree
495 from StringIO import StringIO
496 xml = etree.parse(StringIO(holder['xml']))
497 holder['xml'] = etree.tostring(xml, pretty_print=pretty_print, encoding=unicode)
503 # ogarnąć podrozdziały
506 # usunąć 'opis zawartości'