2 # -*- coding: utf-8 -*-
4 from slughifi import slughifi
8 def __init__(self, state, lines):
13 return cls(self.state, self.lines)
def line(self, position):
    """Return the entry of ``self.lines`` found at index ``position``.

    Plain sequence indexing is used, so whatever ``self.lines`` raises for
    an out-of-range index propagates to the caller unchanged.
    """
    all_lines = self.lines
    return all_lines[position]
18 empty_line = re.compile(r"\s+")
20 def skip_empty(self, position):
21 while self.line(position) == "" or \
22 self.empty_line.match(self.line(position)):
26 def tag(self, position):
28 Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
    """Return ``content`` enclosed in an opening and closing ``tagname`` tag.

    The result is a unicode string; ``content`` is interpolated as-is,
    without any escaping.
    """
    template = u"<%s>%s</%s>"
    pieces = (tagname, content, tagname)
    return template % pieces
36 class Section(Tagger):
37 looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
40 pos2 = self.skip_empty(pos)
42 m = self.looks_like.match(self.line(pos))
44 self.title = m.groups()[0]
def __unicode__(self):
    """Return ``self.title`` passed through ``self.wrap`` with the
    ``naglowek_rozdzial`` tag name."""
    render = self.wrap
    return render("naglowek_rozdzial", self.title)
52 looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
55 pos = self.skip_empty(pos)
56 m = self.looks_like.match(self.line(pos))
60 m = self.state.get('meta', {})
62 self.state['meta'] = m
66 class Informacje(Tagger):
68 self.title = self.spawn(Section)
70 pos = self.title.tag(pos)
71 if pos is None: return
75 pos = self.skip_empty(pos)
76 meta = self.spawn(Meta)
78 if pos2 is None: break
79 self.meta.append(meta)
90 if l and l[0] in ('-', '*'):
91 self.items.append(l[1:].strip())
98 def __unicode__(self):
101 s += "<punkt>%s</punkt>\n" % i
106 class Paragraph(Tagger):
108 re.compile(r"[\s]*opis zawartości[\s]*", re.I),
109 re.compile(r"^[\s]*$")
112 re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
115 self.line = self.lines[pos]
117 self.is_podrozdzial = False
119 for x in self.remove_this:
120 if x.match(self.line):
123 for x in self.podrozdzial:
124 if x.match(self.line):
125 self.is_podrozdzial = True
129 def __unicode__(self):
131 if self.is_podrozdzial:
132 tag = 'naglowek_podrozdzial'
135 return u"<%s>%s</%s>" % (tag, self.line, tag)
141 def __init__(self, tag_name, *elems):
142 self.tag_name = tag_name
145 def __unicode__(self):
146 s = u"<%s>" % self.tag_name
149 if isinstance(e, (str, unicode)):
152 s += "\n " + unicode(e)
156 s += u"</%s>" % self.tag_name
160 def eatany(pos, *taggers):
162 for t in list(taggers):
171 def eatseq(pos, *taggers):
173 taggers = list(taggers[:])
176 p = taggers[0].tag(pos)
178 return (tuple(good), pos)
179 good.append(taggers.pop(0))
180 # print "%d -> %d" % (pos, p)
184 print "Got index error for pos=%d" % pos
185 return (tuple(good), pos)
190 tagger(text) function name and signature is a contract.
191 returns auto-tagged text
193 if not isinstance(text, unicode):
194 text = unicode(text.decode('utf-8'))
195 lines = text.split("\n")
199 info = Informacje(state, lines)
201 ((info,), pos) = eatseq(pos, info)
203 # print "[i] %d. %s" % (pos, lines[pos])
208 x, pos = eatany(pos, info.spawn(Section),
209 info.spawn(List), info.spawn(Paragraph))
214 content.append(lines[pos])
216 if pos >= len(lines):
219 return toxml(content)
222 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
223 'relation': u'moduły powiązane linki',
224 'description.material': u'linki do załączników',
225 'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
229 def mark_activities(content):
232 is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
233 # import pdb; pdb.set_trace()
234 is_next_section = re.compile(r"^[IVX]+[.]? ")
235 is_activity = re.compile(r"^[0-9]+[.]? (.+)")
237 is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
238 is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
239 is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
241 'pomoce': is_activity_tools,
242 'forma': is_activity_work,
243 'czas': is_activity_time
247 in_activities = False
252 if isinstance(e, Paragraph):
253 if not in_activities and \
254 is_przebieg.match(e.line):
257 if in_activities and \
258 is_next_section.match(e.line):
259 in_activities = False
261 m = is_activity.match(e.line)
263 e.line = m.groups()[0]
265 if is_activity_time.match(e.line):
267 activities.append((ab, ae))
272 for ab, ae in activities:
277 act_els.append(Container("opis", content[ab]))
278 for i in range(ab, ae):
280 if isinstance(e, Paragraph):
281 for prop, pattern in activity_props.items():
282 m = pattern.match(e.line)
284 act_els.append(Container(prop, m.groups()[0]))
285 if info_start > i: info_start = i
286 act_els.insert(1, Container('wskazowki',
287 *content[ab + 1:info_start]))
288 content[ab:ae] = [Container('aktywnosc', *act_els)]
292 def mark_dictionary(content):
296 is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
297 is_dictentry = re.compile(r"([^-]+) - (.+)")
299 while i < len(content):
301 if isinstance(e, Section):
302 if is_dictionary.match(e.title):
306 content[db:de] = [Container('slowniczek', *slowniczek)]
308 if isinstance(e, Paragraph):
309 m = is_dictentry.match(e.line)
311 slowniczek.append(Container('definiendum', m.groups()[0]))
312 slowniczek.append(Container('definiens', m.groups()[1]))
321 content = mark_activities(content)
322 content = mark_dictionary(content)
323 info = content.pop(0)
327 slug = slughifi(meta.get(u'Tytuł modułu', ''))
332 holder['xml'] += u"%s\n" % t
335 p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
338 p(u'<%s>%s</%s>' % (tag, ct, tag))
345 p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
346 p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
347 authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
348 for author in authors:
349 names = author.split(u' ')
350 lastname = names.pop()
351 names.insert(0, lastname + ",")
352 author = u' '.join(names)
353 dc(u'creator', author)
354 dc(u'title', meta.get(u'Tytuł modułu', u''))
355 dc(u'relation.isPartOf', meta.get(u'Dział', u''))
356 dc(u'publisher', u'Fundacja Nowoczesna Polska')
357 dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
358 dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
359 for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
360 keyword = keyword.strip()
361 dc(u'subject', keyword)
362 dc(u'description', dc_fixed['description'])
363 dc(u'description.material', dc_fixed['description.material'])
364 dc(u'relation', dc_fixed['relation'])
365 dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
366 dc(u'rights', dc_fixed['rights'])
367 dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
368 dc(u'format', u'xml')
370 dc(u'date', u'2012-11-09') # TODO
371 dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
372 dc(u'language', u'pol')
373 p(u'</rdf:Description>')
377 t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
379 a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
382 p(unicode(info.title))
384 if isinstance(elm, unicode) or isinstance(elm, str):
# TODO: sort out the handling of subsections
# TODO: remove the 'opis zawartości' line