2 # -*- coding: utf-8 -*-
4 from slughifi import slughifi
8 def __init__(self, state, lines):
13 return cls(self.state, self.lines)
def line(self, position):
    """Return the entry of ``self.lines`` stored at index ``position``.

    Indexing is delegated directly to ``self.lines``, so whatever that
    container raises for an invalid index propagates to the caller.
    """
    all_lines = self.lines
    return all_lines[position]
# List holding one compiled pattern. It matches a line consisting of "[P]" or
# "[A]", followed by " - ", followed by one or more non-space characters.
18 ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
# Matches a line made up solely of whitespace (at least one character); a
# zero-length string does not match.
19 empty_line = re.compile(r"^\s+$")
21 def skip_empty(self, position):
22 while self.line(position) == "" or \
23 self.empty_line.match(self.line(position)) or \
24 filter(lambda r: r.match(self.line(position)),
29 def tag(self, position):
31 Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
    """Return ``content`` enclosed in ``<tagname>`` ... ``</tagname>``.

    Both arguments are interpolated with ``%s`` exactly as given; no
    escaping is applied to either of them.
    """
    pieces = (tagname, content, tagname)
    return u"<%s>%s</%s>" % pieces
def anymatches(regex):
    """Build a one-argument callable that runs ``regex.match`` on its input.

    The returned callable gives back whatever ``regex.match`` returns: a
    match object on success, ``None`` otherwise.
    """
    def matcher(x):
        return regex.match(x)

    return matcher
44 class Section(Tagger):
45 looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
48 pos2 = self.skip_empty(pos)
50 m = self.looks_like.match(self.line(pos))
52 self.title = m.groups()[0]
def __unicode__(self):
    """Render ``self.title`` through ``self.wrap`` using the ``naglowek_rozdzial`` tag name."""
    heading_tag = "naglowek_rozdzial"
    return self.wrap(heading_tag, self.title)
60 looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
63 pos = self.skip_empty(pos)
64 m = self.looks_like.match(self.line(pos))
68 m = self.state.get('meta', {})
70 self.state['meta'] = m
74 class Informacje(Tagger):
76 self.title = self.spawn(Section)
78 pos = self.title.tag(pos)
79 if pos is None: return
83 pos = self.skip_empty(pos)
84 meta = self.spawn(Meta)
86 if pos2 is None: break
87 self.meta.append(meta)
94 point = re.compile(r"^[\s]*([-*])")
100 m = self.point.match(l)
102 self.items.append(l[1:].strip())
109 def __unicode__(self):
110 s = '<lista typ="punkt">'
112 s += "\n<punkt>%s</punkt>" % i
117 class Paragraph(Tagger):
119 re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
120 re.compile(r"^[\s]*$")
123 re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
126 self.line = self.lines[pos]
128 self.is_podrozdzial = False
130 for x in self.remove_this:
131 if x.match(self.line):
134 for x in self.podrozdzial:
135 if x.match(self.line):
136 self.is_podrozdzial = True
140 def __unicode__(self):
142 if self.is_podrozdzial:
143 tag = 'naglowek_podrozdzial'
146 return u"<%s>%s</%s>" % (tag, self.line, tag)
152 def __init__(self, tag_name, *elems):
153 self.tag_name = tag_name
156 def __unicode__(self):
157 s = u"<%s>" % self.tag_name
160 if isinstance(e, (str, unicode)):
163 s += "\n " + unicode(e)
167 s += u"</%s>" % self.tag_name
171 def eatany(pos, *taggers):
173 for t in list(taggers):
182 def eatseq(pos, *taggers):
184 taggers = list(taggers[:])
187 p = taggers[0].tag(pos)
189 return (tuple(good), pos)
190 good.append(taggers.pop(0))
191 # print "%d -> %d" % (pos, p)
195 print "Got index error for pos=%d" % pos
196 return (tuple(good), pos)
201 tagger(text) function name and signature is a contract.
202 returns auto-tagged text
204 if not isinstance(text, unicode):
205 text = unicode(text.decode('utf-8'))
206 lines = text.split("\n")
210 info = Informacje(state, lines)
212 ((info,), pos) = eatseq(pos, info)
214 # print "[i] %d. %s" % (pos, lines[pos])
219 x, pos = eatany(pos, info.spawn(Section),
220 info.spawn(List), info.spawn(Paragraph))
225 content.append(lines[pos])
227 if pos >= len(lines):
230 return toxml(content)
233 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
234 'relation': u'moduły powiązane linki',
235 'description.material': u'linki do załączników',
236 'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
240 def mark_activities(content):
243 is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
244 # import pdb; pdb.set_trace()
245 is_next_section = re.compile(r"^[IVX]+[.]? ")
246 is_activity = re.compile(r"^[0-9]+[.]? (.+)")
248 is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
249 is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
250 is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
252 'pomoce': is_activity_tools,
253 'forma': is_activity_work,
254 'czas': is_activity_time
258 in_activities = False
263 if isinstance(e, Paragraph):
264 if not in_activities and \
265 is_przebieg.match(e.line):
268 if in_activities and \
269 is_next_section.match(e.line):
270 in_activities = False
272 m = is_activity.match(e.line)
274 e.line = m.groups()[0]
276 if is_activity_time.match(e.line):
278 activities.append((ab, ae))
283 for ab, ae in activities:
288 act_els.append(Container("opis", content[ab]))
289 for i in range(ab, ae):
291 if isinstance(e, Paragraph):
292 for prop, pattern in activity_props.items():
293 m = pattern.match(e.line)
295 act_els.append(Container(prop, m.groups()[0]))
296 if info_start > i: info_start = i
297 act_els.insert(1, Container('wskazowki',
298 *content[ab + 1:info_start]))
299 content[ab:ae] = [Container('aktywnosc', *act_els)]
303 def mark_dictionary(content):
307 is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
308 is_dictentry = re.compile(r"([^-]+) - (.+)")
310 while i < len(content):
312 if isinstance(e, Section):
313 if is_dictionary.match(e.title):
317 content[db:de] = [Container('slowniczek', *slowniczek)]
319 if isinstance(e, Paragraph):
320 m = is_dictentry.match(e.line)
322 slowniczek.append(Container('definiendum', m.groups()[0]))
323 slowniczek.append(Container('definiens', m.groups()[1]))
332 content = mark_activities(content)
333 content = mark_dictionary(content)
334 info = content.pop(0)
338 slug = slughifi(meta.get(u'Tytuł modułu', ''))
343 holder['xml'] += u"%s\n" % t
346 p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
349 p(u'<%s>%s</%s>' % (tag, ct, tag))
356 p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
357 p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
358 authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
359 for author in authors:
360 names = author.split(u' ')
361 lastname = names.pop()
362 names.insert(0, lastname + ",")
363 author = u' '.join(names)
364 dc(u'creator', author)
365 dc(u'title', meta.get(u'Tytuł modułu', u''))
366 dc(u'relation.isPartOf', meta.get(u'Dział', u''))
367 dc(u'publisher', u'Fundacja Nowoczesna Polska')
368 dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
369 dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
370 for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
371 keyword = keyword.strip()
372 dc(u'subject', keyword)
373 dc(u'description', dc_fixed['description'])
374 dc(u'description.material', dc_fixed['description.material'])
375 dc(u'relation', dc_fixed['relation'])
376 dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
377 dc(u'rights', dc_fixed['rights'])
378 dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
379 dc(u'format', u'xml')
381 dc(u'date', u'2012-11-09') # TODO
382 dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
383 dc(u'language', u'pol')
384 p(u'</rdf:Description>')
388 t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
390 a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
393 p(unicode(info.title))
395 if isinstance(elm, unicode) or isinstance(elm, str):
407 # sort out subsections
410 # remove 'opis zawartości'