X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/65f4793d71ca86c80a7f4f7974e8d65a3ebe6f25..4a1f5b4c9e482dcc399940c027a9fdf6a74ea78a:/apps/catalogue/management/edumed.py
diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py
index e5d5ee0a..c319132a 100644
--- a/apps/catalogue/management/edumed.py
+++ b/apps/catalogue/management/edumed.py
@@ -4,7 +4,7 @@ import re
from slughifi import slughifi
-class Tagger:
+class Tagger(object):
def __init__(self, state, lines):
self.state = state
self.lines = lines
@@ -15,11 +15,14 @@ class Tagger:
def line(self, position):
return self.lines[position]
- empty_line = re.compile(r"\s+")
-
+ ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
+ empty_line = re.compile(r"^\s+$")
+
def skip_empty(self, position):
while self.line(position) == "" or \
- self.empty_line.match(self.line(position)):
+ self.empty_line.match(self.line(position)) or \
+ filter(lambda r: r.match(self.line(position)),
+ self.ignore[:]):
position += 1
return position
@@ -32,10 +35,18 @@ Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
return u"<%s>%s%s>" % (tagname, content, tagname)
+ @staticmethod
+ def anymatches(regex):
+ return lambda x: regex.match(x)
+
class Section(Tagger):
looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
+ def __init__(self, *a):
+ super(Section, self).__init__(*a)
+ self.is_podrozdzial = False
+
def tag(self, pos):
pos2 = self.skip_empty(pos)
pos = pos2
@@ -45,7 +56,8 @@ class Section(Tagger):
return pos + 1
def __unicode__(self):
- return self.wrap("naglowek_rozdzial", self.title)
+ return self.wrap(self.is_podrozdzial and "naglowek_podrozdzial" or "naglowek_rozdzial",
+ self.title)
class Meta(Tagger):
@@ -83,47 +95,67 @@ class Informacje(Tagger):
class List(Tagger):
- def tag(self, pos):
+ point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
+ num = re.compile(r"^[\s]*[a-z][.]\s+(.*)")
+
+ def __init__(self, *args):
+
+ super(List, self).__init__(*args)
self.items = []
+ self.type = 'punkt'
+
+ def tag(self, pos):
while True:
l = self.line(pos)
- if l and l[0] in ('-', '*'):
- self.items.append(l[1:].strip())
+ m = self.point.match(l)
+ if not m:
+ m = self.num.match(l)
+ if m: self.type = 'num'
+ if l and m:
+ self.items.append(m.groups()[0].lstrip())
pos += 1
else:
break
if self.items:
return pos
+ def append(self, tagger):
+ self.items.append(tagger)
+
def __unicode__(self):
- s = "\n"
+ s = '' % self.type
for i in self.items:
- s += "%s\n" % i
- s += "\n"
+ if isinstance(i, list):
+ x = "\n".join(map(lambda elem: unicode(elem), i))
+ else:
+ x = unicode(i)
+ s += "\n%s" % x
+ s += "\n\n"
return s
class Paragraph(Tagger):
remove_this = [
- re.compile(r"[\s]*opis zawartoÅci[\s]*", re.I),
+ re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
re.compile(r"^[\s]*$")
]
podrozdzial = [
- re.compile(r"[\s]*(przebieg zajÄÄ|opcje dodatkowe)[\s]*", re.I),
+ re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
]
+
def tag(self, pos):
self.line = self.lines[pos]
self.ignore = False
self.is_podrozdzial = False
-
+
for x in self.remove_this:
if x.match(self.line):
self.ignore = True
-
+
for x in self.podrozdzial:
if x.match(self.line):
self.is_podrozdzial = True
-
+
return pos + 1
def __unicode__(self):
@@ -185,7 +217,7 @@ def eatseq(pos, *taggers):
return (tuple(good), pos)
-def tagger(text):
+def tagger(text, pretty_print=False):
"""
tagger(text) function name and signature is a contract.
returns auto-tagged text
@@ -197,7 +229,7 @@ returns auto-tagged text
content = []
state = {}
info = Informacje(state, lines)
-
+
((info,), pos) = eatseq(pos, info)
# print "[i] %d. %s" % (pos, lines[pos])
@@ -216,23 +248,63 @@ returns auto-tagged text
if pos >= len(lines):
break
- return toxml(content)
+ return toxml(content, pretty_print=pretty_print)
dc_fixed = {
'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa PrzyszÅoÅÄ (http://cyfrowaprzyszlosc.pl).',
- 'relation': u'moduÅy powiÄ
zane linki',
+ 'relation': u'moduÅy powiÄ
zane linki',
'description.material': u'linki do zaÅÄ
czników',
'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
}
+class NotFound(Exception):
+    """Raised by find_block() when no element matches the requested title."""
+
+
+def find_block(content, title_re, begin=-1, end=-1):
+    """
+    Locate the block of `content` that is introduced by a matching title.
+
+    A title is a Paragraph whose `line`, or a Section whose `title`,
+    matches `title_re` (compiled case-insensitively with re.UNICODE and
+    applied with `match`, i.e. anchored at the start of the text).
+    The block continues over every following List and every Paragraph
+    with a non-empty `line`; it stops at the first other element.
+    If another title matches while scanning, the block restarts there.
+
+    `begin` / `end` limit the scanned index range; a negative value means
+    "from the start" / "to the end of content" respectively.
+
+    Returns a `(start, stop)` tuple where `start` is the index of the title
+    and `stop` is exclusive, so `content[start:stop]` is the whole block.
+    Raises NotFound when no title matches in the scanned range.
+    """
+    title_re = re.compile(title_re, re.I | re.UNICODE)
+
+    if begin < 0:
+        begin = 0
+    if end < 0:
+        end = len(content)
+
+    rb = -1
+    # When the scan runs off the end of the range without meeting a
+    # terminating element, the block reaches up to (and excluding) `end`.
+    # Returning the last loop index instead would cut off the final element.
+    block_end = end
+    for i in range(begin, end):
+        elem = content[i]
+        if isinstance(elem, Paragraph) and title_re.match(elem.line):
+            rb = i
+            continue
+        if isinstance(elem, Section) and title_re.match(elem.title):
+            rb = i
+            continue
+        if rb < 0:
+            # Still looking for the title.
+            continue
+        if isinstance(elem, List):
+            continue
+        if isinstance(elem, Paragraph) and elem.line:
+            continue
+        # First element that cannot belong to the block.
+        block_end = i
+        break
+
+    if rb < 0:
+        raise NotFound()
+    return rb, block_end
+
+
+def remove_block(content, title_re, removed=None):
+    """
+    Cut the block titled `title_re` out of `content`, modifying it in place.
+
+    The block is located with find_block(); any exception raised by that
+    call propagates to the caller. When `removed` is a list, the elements
+    taken out are appended to it. Returns `content`.
+    """
+    start, stop = find_block(content, title_re)
+    if isinstance(removed, list):
+        removed.extend(content[start:stop])
+    del content[start:stop]
+    return content
+
+
def mark_activities(content):
i = 0
tl = len(content)
is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
- # import pdb; pdb.set_trace()
+
is_next_section = re.compile(r"^[IVX]+[.]? ")
- is_activity = re.compile(r"^[0-9]+[.]? (.+)")
+ is_activity = re.compile(r"^[0-9]+[.] (.+)")
is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
@@ -249,14 +321,16 @@ def mark_activities(content):
ae = -1
while True:
e = content[i]
+ if isinstance(e, Section):
+ if in_activities and \
+ is_next_section.match(e.title):
+ in_activities = False
+
if isinstance(e, Paragraph):
if not in_activities and \
is_przebieg.match(e.line):
in_activities = True
- if in_activities and \
- is_next_section.match(e.line):
- in_activities = False
if in_activities:
m = is_activity.match(e.line)
if m:
@@ -295,7 +369,8 @@ def mark_dictionary(content):
i = 0
is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
is_dictentry = re.compile(r"([^-]+) - (.+)")
- slowniczek = []
+ slowniczek = content[0].spawn(List)
+ slowniczek.type = 'slowniczek'
while i < len(content):
e = content[i]
if isinstance(e, Section):
@@ -303,13 +378,15 @@ def mark_dictionary(content):
db = i + 1
elif db >= 1:
de = i
- content[db:de] = [Container('slowniczek', *slowniczek)]
+ content[db:de] = [slowniczek]
+ break
elif db >= 0:
if isinstance(e, Paragraph):
m = is_dictentry.match(e.line)
if m:
- slowniczek.append(Container('definiendum', m.groups()[0]))
- slowniczek.append(Container('definiens', m.groups()[1]))
+ slowniczek.append([Container('definiendum', m.groups()[0]),
+ Container('definiens', m.groups()[1])])
+
else:
slowniczek.append(e)
i += 1
@@ -317,9 +394,42 @@ def mark_dictionary(content):
return content
-def toxml(content):
+def move_evaluation(content):
+    """
+    Move the "ewaluacja" block so that it precedes "opcje dodatkowe".
+
+    The evaluation block is cut out of `content` and its first element
+    (the title) is flagged with `is_podrozdzial = True`. It is then
+    re-inserted just before the "opcje dodatkowe" block or, if there is
+    none, just before the "materia.y" block.
+
+    find_block() / remove_block() report a missing block by raising
+    NotFound, not by returning a falsy value, so every lookup is guarded:
+    - no evaluation block: `content` is returned untouched;
+    - no insertion point: a message is printed and the evaluation block
+      is not re-inserted.
+
+    Returns `content` (modified in place).
+    """
+    evaluation = []
+    try:
+        content = remove_block(content, r"ewaluacja[+ PA\[\].]*", evaluation)
+    except NotFound:
+        return content
+    if not evaluation:
+        return content
+
+    evaluation[0].is_podrozdzial = True
+
+    # Candidate insertion points, in order of preference.
+    anchors = (r"opcje dodatkowe\s*", r"materia.y[+ AP\[\].]*")
+    for anchor_re in anchors:
+        try:
+            where = find_block(content, anchor_re)[0]
+        except NotFound:
+            continue
+        content[where:where] = evaluation
+        break
+    else:
+        # Single parenthesised argument: prints the same text under
+        # both the Python 2 print statement and the print() function.
+        print("er.. no idea where to place evaluation")
+    return content
+
+
+def toxml(content, pretty_print=False):
+ # some transformations
content = mark_activities(content)
content = mark_dictionary(content)
+ try:
+ content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
+ except NotFound:
+ pass
+ try:
+ content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+ except NotFound:
+ pass
+ content = move_evaluation(content)
+
info = content.pop(0)
state = info.state
@@ -375,9 +485,9 @@ def toxml(content):
p(u'')
t(u'nazwa_utworu', meta.get(u'TytuÅ moduÅu', u''))
- p(u'')
- a(u'Numer porzÄ
dkowy: %s' % meta.get(u'Numer porzÄ
dkowy', u''))
- p(u'')
+ # p(u'')
+ a(u'' % meta.get(u'Numer porzÄ
dkowy', u''))
+ # p(u'')
p(unicode(info.title))
for elm in content:
@@ -389,6 +499,12 @@ def toxml(content):
p(u'')
p(u'')
+ if pretty_print:
+ from lxml import etree
+ from StringIO import StringIO
+ xml = etree.parse(StringIO(holder['xml']))
+ holder['xml'] = etree.tostring(xml, pretty_print=pretty_print, encoding=unicode)
+
return holder['xml']