from slughifi import slughifi
-class Tagger:
+class Tagger(object):
def __init__(self, state, lines):
self.state = state
self.lines = lines
def line(self, position):
return self.lines[position]
- ignore = [ re.compile(r"^[\[][PA][\]] - [^ ]+$") ]
+ ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
empty_line = re.compile(r"^\s+$")
-
+
def skip_empty(self, position):
while self.line(position) == "" or \
self.empty_line.match(self.line(position)) or \
position += 1
return position
-
def tag(self, position):
"""
Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
return u"<%s>%s</%s>" % (tagname, content, tagname)
+ @staticmethod
+ def anymatches(regex):
+ return lambda x: regex.match(x)
+
class Section(Tagger):
looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
+ def __init__(self, *a):
+ super(Section, self).__init__(*a)
+ self.is_podrozdzial = False
+
def tag(self, pos):
pos2 = self.skip_empty(pos)
pos = pos2
return pos + 1
def __unicode__(self):
- return self.wrap("naglowek_rozdzial", self.title)
+ return self.wrap(self.is_podrozdzial and "naglowek_podrozdzial" or "naglowek_rozdzial",
+ self.title)
class Meta(Tagger):
class List(Tagger):
- def tag(self, pos):
+ point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
+ num = re.compile(r"^[\s]*[a-z][.]\s+(.*)")
+
+ def __init__(self, *args):
+
+ super(List, self).__init__(*args)
self.items = []
+ self.type = 'punkt'
+
+ def tag(self, pos):
while True:
l = self.line(pos)
- if l and l[0] in ('-', '*'):
- self.items.append(l[1:].strip())
+ m = self.point.match(l)
+ if not m:
+ m = self.num.match(l)
+ if m: self.type = 'num'
+ if l and m:
+ self.items.append(m.groups()[0].lstrip())
pos += 1
else:
break
if self.items:
return pos
+ def append(self, tagger):
+ self.items.append(tagger)
+
def __unicode__(self):
- s = "<lista>\n"
+ s = '<lista typ="%s">' % self.type
for i in self.items:
- s += "<punkt>%s</punkt>\n" % i
- s += "</lista>\n"
+ if isinstance(i, list):
+ x = "\n".join(map(lambda elem: unicode(elem), i))
+ else:
+ x = unicode(i)
+ s += "\n<punkt>%s</punkt>" % x
+ s += "\n</lista>\n"
return s
class Paragraph(Tagger):
remove_this = [
re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
- re.compile(r"^[\s]*$")
+ re.compile(r"^[\s]*$"),
+ re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek")
]
podrozdzial = [
- re.compile(r"[\s]*(przebieg zajęć|opcje dodatkowe)[\s]*", re.I),
+ re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
]
+
def tag(self, pos):
self.line = self.lines[pos]
self.ignore = False
self.is_podrozdzial = False
-
+
for x in self.remove_this:
if x.match(self.line):
self.ignore = True
-
+
for x in self.podrozdzial:
if x.match(self.line):
self.is_podrozdzial = True
-
+
return pos + 1
def __unicode__(self):
return (tuple(good), pos)
-def tagger(text):
+def tagger(text, pretty_print=False):
"""
tagger(text) function name and signature is a contract.
returns auto-tagged text
content = []
state = {}
info = Informacje(state, lines)
-
+
((info,), pos) = eatseq(pos, info)
# print "[i] %d. %s" % (pos, lines[pos])
if pos >= len(lines):
break
- return toxml(content)
+ return toxml(content, pretty_print=pretty_print)
dc_fixed = {
- 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
- 'relation': u'moduły powiązane linki',
+ 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://edukacjamedialna.edu.pl).',
+ 'relation': u'moduły powiązane linki',
'description.material': u'linki do załączników',
'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
}
class NotFound(Exception):
    """Raised by find_block when no element matches the requested title."""
+
+
def find_block(content, title_re, begin=-1, end=-1):
    """Locate a titled block in ``content`` and return its index range.

    A block starts at an element whose text matches ``title_re`` (the
    ``line`` of a Paragraph or the ``title`` of a Section) and extends over
    the following List elements and Paragraphs with a non-empty ``line``.
    The first element of any other kind ends the block.

    If a later element matches ``title_re`` again before the block ends,
    the block start moves to that element.

    Args:
        content: sequence of tagged elements to scan.
        title_re: regular expression source; compiled case-insensitively
            with re.UNICODE and applied with ``match`` (anchored at the
            start of the text).
        begin: first index to scan; a negative value means 0.
        end: index to stop scanning at (exclusive); a negative value means
            ``len(content)``.

    Returns:
        A ``(start, stop)`` tuple where ``start`` is the index of the title
        element and ``stop`` is exclusive, so ``content[start:stop]`` is the
        whole block.

    Raises:
        NotFound: if no element in the scanned range matches ``title_re``.
    """
    pattern = re.compile(title_re, re.I | re.UNICODE)

    if begin < 0:
        begin = 0
    if end < 0:
        end = len(content)

    rb = -1
    # If nothing terminates the block it runs to the end of the scanned
    # range. Returning the leaked loop index here would drop the block's
    # last element.
    block_end = end
    for i in range(begin, end):
        elem = content[i]
        if isinstance(elem, Paragraph) and pattern.match(elem.line):
            rb = i
            continue
        if isinstance(elem, Section) and pattern.match(elem.title):
            rb = i
            continue
        if rb < 0:
            # Still looking for the title element.
            continue
        if isinstance(elem, List):
            continue
        if isinstance(elem, Paragraph) and elem.line:
            continue
        # First element that cannot belong to the block.
        block_end = i
        break

    if rb < 0:
        raise NotFound()
    return rb, block_end
+
+
def remove_block(content, title_re, removed=None):
    """Cut the block titled ``title_re`` out of ``content`` in place.

    The block boundaries come from find_block; any exception it raises
    propagates to the caller.

    Args:
        content: list of tagged elements; modified in place.
        title_re: regular expression source identifying the block title.
        removed: optional list; when a list is given, the removed elements
            are appended to it.

    Returns:
        The same ``content`` object, with the block removed.
    """
    start, stop = find_block(content, title_re)
    if isinstance(removed, list):
        removed += content[start:stop]
    del content[start:stop]
    return content
+
+
def mark_activities(content):
i = 0
tl = len(content)
is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
- # import pdb; pdb.set_trace()
+
is_next_section = re.compile(r"^[IVX]+[.]? ")
- is_activity = re.compile(r"^[0-9]+[.]? (.+)")
+ is_activity = re.compile(r"^[0-9]+[.] (.+)")
is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
ae = -1
while True:
e = content[i]
+ if isinstance(e, Section):
+ if in_activities and \
+ is_next_section.match(e.title):
+ in_activities = False
+
if isinstance(e, Paragraph):
if not in_activities and \
is_przebieg.match(e.line):
in_activities = True
- if in_activities and \
- is_next_section.match(e.line):
- in_activities = False
if in_activities:
m = is_activity.match(e.line)
if m:
i = 0
is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
is_dictentry = re.compile(r"([^-]+) - (.+)")
- slowniczek = []
+ slowniczek = content[0].spawn(List)
+ slowniczek.type = 'slowniczek'
while i < len(content):
e = content[i]
if isinstance(e, Section):
db = i + 1
elif db >= 1:
de = i
- content[db:de] = [Container('slowniczek', *slowniczek)]
+ content[db:de] = [slowniczek]
+ break
elif db >= 0:
if isinstance(e, Paragraph):
m = is_dictentry.match(e.line)
if m:
- slowniczek.append(Container('definiendum', m.groups()[0]))
- slowniczek.append(Container('definiens', m.groups()[1]))
+ slowniczek.append([Container('definiendum', m.groups()[0]),
+ Container('definiens', m.groups()[1])])
+
else:
- slowniczek.append(e)
+ slowniczek.append(e.line)
i += 1
return content
-def toxml(content):
def mark_czytelnia(content):
    """Replace the body of the "czytelnia" section with a single List.

    Scans ``content`` for a Section whose title matches "czytelnia"
    (case-insensitive, leading whitespace allowed). The non-empty ``line``
    of every Paragraph that follows is collected into a List whose ``type``
    is set to ``'czytelnia'``. The elements between that heading and the
    next Section, or the end of ``content`` when no Section follows, are
    replaced by that List.

    The List is created with ``content[0].spawn(List)``, so ``content``
    must be non-empty.
    NOTE(review): ``spawn`` is not visible in this part of the file;
    presumably it builds a new tagger sharing state with ``content[0]``.

    Args:
        content: list of tagged elements; modified in place.

    Returns:
        The same ``content`` object.
    """
    czy_czytelnia = re.compile(r"[\s]*czytelnia[\s]*", re.I)
    czytelnia = content[0].spawn(List)
    czytelnia.type = 'czytelnia'

    db = -1  # index of the first element after the heading; -1 until found
    de = -1  # index of the Section that closes the block; -1 if none
    for i, e in enumerate(content):
        if isinstance(e, Section):
            if czy_czytelnia.match(e.title):
                db = i + 1
            elif db >= 1:
                de = i
                break
        elif db >= 0 and isinstance(e, Paragraph) and e.line:
            czytelnia.append(e.line)

    if db >= 1:
        if de < 0:
            # The section is the last one in the document. Without this
            # the collected entries would be thrown away.
            de = len(content)
        content[db:de] = [czytelnia]

    return content
+
+
+
def move_evaluation(content):
    """Move the "ewaluacja" block so it sits just before a later section.

    The evaluation block is cut out of ``content`` and its first element is
    flagged with ``is_podrozdzial = True``. It is then re-inserted
    immediately before the "opcje dodatkowe" block, or, when that block is
    absent, immediately before the "materia.y" block.

    When ``content`` has no evaluation block it is returned unchanged. When
    neither target block exists a message is printed and the evaluation
    block stays removed.

    Args:
        content: list of tagged elements; modified in place.

    Returns:
        The same ``content`` object.
    """
    evaluation = []
    try:
        content = remove_block(content, r"ewaluacja[+ PA\[\].]*", evaluation)
    except NotFound:
        # A missing block is reported by exception, not by an empty result.
        return content
    if not evaluation:
        return content

    evaluation[0].is_podrozdzial = True

    # Candidate insertion points, in order of preference.
    for anchor_re in (r"opcje dodatkowe\s*", r"materia.y[+ AP\[\].]*"):
        try:
            anchor = find_block(content, anchor_re)[0]
        except NotFound:
            continue
        content[anchor:anchor] = evaluation
        break
    else:
        print("er.. no idea where to place evaluation")
    return content
+
+
+def toxml(content, pretty_print=False):
+ # some transformations
content = mark_activities(content)
content = mark_dictionary(content)
+ content = mark_czytelnia(content)
+
+ try:
+ content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
+ except NotFound:
+ pass
+ try:
+ content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+ except NotFound:
+ pass
+ content = move_evaluation(content)
+
info = content.pop(0)
state = info.state
p("<utwor>")
p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
- p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
+ p(u'<rdf:Description rdf:about="http://redakcja.edukacjamedialna.edu.pl/documents/">')
authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
for author in authors:
names = author.split(u' ')
dc(u'description', dc_fixed['description'])
dc(u'description.material', dc_fixed['description.material'])
dc(u'relation', dc_fixed['relation'])
- dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
+ dc(u'identifier.url', u'http://edukacjamedialna.edu.pl/%s' % slug)
dc(u'rights', dc_fixed['rights'])
dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
dc(u'format', u'xml')
p(u'<powiesc>')
t(u'nazwa_utworu', meta.get(u'Tytuł modułu', u''))
- p(u'<nota>')
- a(u'Numer porządkowy: %s' % meta.get(u'Numer porządkowy', u''))
- p(u'</nota>')
+ # p(u'<nota>')
+ a(u'<!-- Numer porządkowy: %s -->' % meta.get(u'Numer porządkowy', u''))
+ # p(u'</nota>')
p(unicode(info.title))
for elm in content:
p(u'</powiesc>')
p(u'</utwor>')
+ if pretty_print:
+ from lxml import etree
+ from StringIO import StringIO
+ xml = etree.parse(StringIO(holder['xml']))
+ holder['xml'] = etree.tostring(xml, pretty_print=pretty_print, encoding=unicode)
+
return holder['xml']