class Meta(Tagger):
- looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
+ looks_like = re.compile(r"([^:]+): ?(.*)", re.UNICODE)
def tag(self, pos):
pos = self.skip_empty(pos)
class List(Tagger):
point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
- num = re.compile(r"^[\s]*[a-z]{1,2}[.]\s+(.*)")
+ num = re.compile(r"^[\s]*[a-z][.]\s+(.*)")
def __init__(self, *args):
class Paragraph(Tagger):
remove_this = [
re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
- re.compile(r"^[\s]*$")
+ re.compile(r"^[\s]*$"),
+ re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek")
]
podrozdzial = [
re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
return toxml(content, pretty_print=pretty_print)
dc_fixed = {
- 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
+ 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://edukacjamedialna.edu.pl).',
'relation': u'moduły powiązane linki',
'description.material': u'linki do załączników',
'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
}
class NotFound(Exception):
    """Signals that ``find_block`` located no block for the requested title.

    Raised without arguments; callers that treat a missing block as
    acceptable catch it and carry on.
    """
+
+
def find_block(content, title_re, begin=-1, end=-1):
title_re = re.compile(title_re, re.I | re.UNICODE)
- ## print "looking for %s" % title_re.pattern
- if title_re.pattern[0:6] == 'pomoce':
- import pdb; pdb.set_trace()
rb = -1
if begin < 0: begin = 0
break
if rb >= 0:
return rb, i
+ raise NotFound()
def remove_block(content, title_re, removed=None):
rb, re = find_block(content, title_re)
-
if removed is not None and isinstance(removed, list):
removed += content[rb:re][:]
content[rb:re] = []
is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
is_next_section = re.compile(r"^[IVX]+[.]? ")
- is_activity = re.compile(r"^[0-9]+[.]? (.+)")
+ is_activity = re.compile(r"^[0-9]+[.] (.+)")
is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
ae = -1
while True:
e = content[i]
+ if isinstance(e, Section):
+ if in_activities and \
+ is_next_section.match(e.title):
+ in_activities = False
+
if isinstance(e, Paragraph):
if not in_activities and \
is_przebieg.match(e.line):
in_activities = True
- if in_activities and \
- is_next_section.match(e.line):
- in_activities = False
if in_activities:
m = is_activity.match(e.line)
if m:
Container('definiens', m.groups()[1])])
else:
- slowniczek.append(e)
+ slowniczek.append(e.line)
+ i += 1
+
+ return content
+
+
def mark_czytelnia(content):
    """Collapse the body of the "czytelnia" section into a single List.

    Scans ``content`` for a Section whose title matches "czytelnia"
    (case-insensitive).  The non-empty ``line`` of every Paragraph that
    follows it, up to the next Section, is appended to a List spawned from
    ``content[0]`` whose ``type`` is set to 'czytelnia'.  The elements
    between the heading and the next Section are then replaced by that
    single List.

    If no Section follows the heading, the block is taken to run to the
    end of ``content`` so the collected List is still inserted instead of
    being dropped.

    ``content`` is modified in place and also returned.  Empty content is
    returned untouched.
    """
    if not content:
        # Nothing to scan, and no first element to spawn the List from.
        return content

    czy_czytelnia = re.compile(r"[\s]*czytelnia[\s]*", re.I)
    czytelnia = content[0].spawn(List)
    czytelnia.type = 'czytelnia'

    db = -1  # index of the first element after the "czytelnia" heading
    de = -1  # index of the Section that terminates the block

    for i, e in enumerate(content):
        if isinstance(e, Section):
            if czy_czytelnia.match(e.title):
                db = i + 1
            elif db >= 0:
                # First Section after the heading closes the block.
                de = i
                break
        elif db >= 0 and isinstance(e, Paragraph) and e.line:
            czytelnia.append(e.line)

    if db >= 0:
        if de < 0:
            # The heading was the last Section: the block reaches the end.
            de = len(content)
        content[db:de] = [czytelnia]

    return content
+
def move_evaluation(content):
evaluation = []
# some transformations
content = mark_activities(content)
content = mark_dictionary(content)
- content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
- content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+ content = mark_czytelnia(content)
+
+ try:
+ content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
+ except NotFound:
+ pass
+ try:
+ content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+ except NotFound:
+ pass
content = move_evaluation(content)
info = content.pop(0)
p("<utwor>")
p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
- p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
+ p(u'<rdf:Description rdf:about="http://redakcja.edukacjamedialna.edu.pl/documents/">')
authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
for author in authors:
names = author.split(u' ')
dc(u'description', dc_fixed['description'])
dc(u'description.material', dc_fixed['description.material'])
dc(u'relation', dc_fixed['relation'])
- dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
+ dc(u'identifier.url', u'http://edukacjamedialna.edu.pl/%s' % slug)
dc(u'rights', dc_fixed['rights'])
dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
dc(u'format', u'xml')