2 # -*- coding: utf-8 -*-
4 from slughifi import slughifi
8 def __init__(self, state, lines):
13 return cls(self.state, self.lines)
def line(self, position):
    """Return the input line stored at index ``position``.

    Indexing follows normal list semantics, so an out-of-range
    ``position`` raises ``IndexError``.
    """
    all_lines = self.lines
    return all_lines[position]
18 ignore = [ re.compile(r"^[\[][PA][\]] - [^ ]+$") ]
19 empty_line = re.compile(r"^\s+$")
21 def skip_empty(self, position):
22 while self.line(position) == "" or \
23 self.empty_line.match(self.line(position)) or \
24 filter(lambda r: r.match(self.line(position)),
30 def tag(self, position):
32 Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
    """Return ``content`` enclosed in an opening and closing ``tagname`` tag.

    Both values are interpolated with ``%s``; nothing is escaped.
    """
    opening = u"<%s>" % (tagname,)
    closing = u"</%s>" % (tagname,)
    # One-element tuples keep a tuple-valued argument from being unpacked
    # as several format arguments.
    body = u"%s" % (content,)
    return opening + body + closing
40 class Section(Tagger):
41 looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
44 pos2 = self.skip_empty(pos)
46 m = self.looks_like.match(self.line(pos))
48 self.title = m.groups()[0]
def __unicode__(self):
    """Render the title wrapped in a ``naglowek_rozdzial`` element."""
    heading = self.title
    return self.wrap("naglowek_rozdzial", heading)
56 looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
59 pos = self.skip_empty(pos)
60 m = self.looks_like.match(self.line(pos))
64 m = self.state.get('meta', {})
66 self.state['meta'] = m
70 class Informacje(Tagger):
72 self.title = self.spawn(Section)
74 pos = self.title.tag(pos)
75 if pos is None: return
79 pos = self.skip_empty(pos)
80 meta = self.spawn(Meta)
82 if pos2 is None: break
83 self.meta.append(meta)
94 if l and l[0] in ('-', '*'):
95 self.items.append(l[1:].strip())
102 def __unicode__(self):
105 s += "<punkt>%s</punkt>\n" % i
110 class Paragraph(Tagger):
112 re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
113 re.compile(r"^[\s]*$")
116 re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
119 self.line = self.lines[pos]
121 self.is_podrozdzial = False
123 for x in self.remove_this:
124 if x.match(self.line):
127 for x in self.podrozdzial:
128 if x.match(self.line):
129 self.is_podrozdzial = True
133 def __unicode__(self):
135 if self.is_podrozdzial:
136 tag = 'naglowek_podrozdzial'
139 return u"<%s>%s</%s>" % (tag, self.line, tag)
145 def __init__(self, tag_name, *elems):
146 self.tag_name = tag_name
149 def __unicode__(self):
150 s = u"<%s>" % self.tag_name
153 if isinstance(e, (str, unicode)):
156 s += "\n " + unicode(e)
160 s += u"</%s>" % self.tag_name
164 def eatany(pos, *taggers):
166 for t in list(taggers):
175 def eatseq(pos, *taggers):
177 taggers = list(taggers[:])
180 p = taggers[0].tag(pos)
182 return (tuple(good), pos)
183 good.append(taggers.pop(0))
184 # print "%d -> %d" % (pos, p)
188 print "Got index error for pos=%d" % pos
189 return (tuple(good), pos)
194 tagger(text) function name and signature is a contract.
195 returns auto-tagged text
197 if not isinstance(text, unicode):
198 text = unicode(text.decode('utf-8'))
199 lines = text.split("\n")
203 info = Informacje(state, lines)
205 ((info,), pos) = eatseq(pos, info)
207 # print "[i] %d. %s" % (pos, lines[pos])
212 x, pos = eatany(pos, info.spawn(Section),
213 info.spawn(List), info.spawn(Paragraph))
218 content.append(lines[pos])
220 if pos >= len(lines):
223 return toxml(content)
226 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
227 'relation': u'moduły powiązane linki',
228 'description.material': u'linki do załączników',
229 'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
233 def mark_activities(content):
236 is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
237 # import pdb; pdb.set_trace()
238 is_next_section = re.compile(r"^[IVX]+[.]? ")
239 is_activity = re.compile(r"^[0-9]+[.]? (.+)")
241 is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
242 is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
243 is_activity_time = re.compile(r"^czas:[\s]*([\d]+).*")
245 'pomoce': is_activity_tools,
246 'forma': is_activity_work,
247 'czas': is_activity_time
251 in_activities = False
256 if isinstance(e, Paragraph):
257 if not in_activities and \
258 is_przebieg.match(e.line):
261 if in_activities and \
262 is_next_section.match(e.line):
263 in_activities = False
265 m = is_activity.match(e.line)
267 e.line = m.groups()[0]
269 if is_activity_time.match(e.line):
271 activities.append((ab, ae))
276 for ab, ae in activities:
281 act_els.append(Container("opis", content[ab]))
282 for i in range(ab, ae):
284 if isinstance(e, Paragraph):
285 for prop, pattern in activity_props.items():
286 m = pattern.match(e.line)
288 act_els.append(Container(prop, m.groups()[0]))
289 if info_start > i: info_start = i
290 act_els.insert(1, Container('wskazowki',
291 *content[ab + 1:info_start]))
292 content[ab:ae] = [Container('aktywnosc', *act_els)]
296 def mark_dictionary(content):
300 is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
301 is_dictentry = re.compile(r"([^-]+) - (.+)")
303 while i < len(content):
305 if isinstance(e, Section):
306 if is_dictionary.match(e.title):
310 content[db:de] = [Container('slowniczek', *slowniczek)]
312 if isinstance(e, Paragraph):
313 m = is_dictentry.match(e.line)
315 slowniczek.append(Container('definiendum', m.groups()[0]))
316 slowniczek.append(Container('definiens', m.groups()[1]))
325 content = mark_activities(content)
326 content = mark_dictionary(content)
327 info = content.pop(0)
331 slug = slughifi(meta.get(u'Tytuł modułu', ''))
336 holder['xml'] += u"%s\n" % t
339 p(u'<dc:%s xml:lang="pl" xmlns:dc="http://purl.org/dc/elements/1.1/">%s</dc:%s>' % (k, v, k))
342 p(u'<%s>%s</%s>' % (tag, ct, tag))
349 p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
350 p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
351 authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
352 for author in authors:
353 names = author.split(u' ')
354 lastname = names.pop()
355 names.insert(0, lastname + ",")
356 author = u' '.join(names)
357 dc(u'creator', author)
358 dc(u'title', meta.get(u'Tytuł modułu', u''))
359 dc(u'relation.isPartOf', meta.get(u'Dział', u''))
360 dc(u'publisher', u'Fundacja Nowoczesna Polska')
361 dc(u'subject.competence', meta.get(u'Wybrana kompetencja z Katalogu', u''))
362 dc(u'subject.curriculum', meta.get(u'Odniesienie do podstawy programowej', u''))
363 for keyword in meta.get(u'Słowa kluczowe', u'').split(u','):
364 keyword = keyword.strip()
365 dc(u'subject', keyword)
366 dc(u'description', dc_fixed['description'])
367 dc(u'description.material', dc_fixed['description.material'])
368 dc(u'relation', dc_fixed['relation'])
369 dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
370 dc(u'rights', dc_fixed['rights'])
371 dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
372 dc(u'format', u'xml')
374 dc(u'date', u'2012-11-09') # TODO
375 dc(u'audience', meta.get(u'Poziom edukacyjny', u''))
376 dc(u'language', u'pol')
377 p(u'</rdf:Description>')
381 t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
383 a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
386 p(unicode(info.title))
388 if isinstance(elm, unicode) or isinstance(elm, str):
400 # ogarnąć podrozdziały
403 # usunąć 'opis zawartości'