X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/e2d4fbf230c90dcf3e904351a70d399426478352..b6b62a76505574e1961e15900a96a70a5461f108:/apps/catalogue/management/edumed.py?ds=inline
diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py
index ffad2ec1..05034368 100644
--- a/apps/catalogue/management/edumed.py
+++ b/apps/catalogue/management/edumed.py
@@ -4,7 +4,7 @@ import re
from slughifi import slughifi
-class Tagger:
+class Tagger(object):
def __init__(self, state, lines):
self.state = state
self.lines = lines
@@ -15,9 +15,9 @@ class Tagger:
def line(self, position):
return self.lines[position]
- ignore = [ re.compile(r"^[\[][PA][\]] - [^ ]+$") ]
+ ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
empty_line = re.compile(r"^\s+$")
-
+
def skip_empty(self, position):
while self.line(position) == "" or \
self.empty_line.match(self.line(position)) or \
@@ -26,7 +26,6 @@ class Tagger:
position += 1
return position
-
def tag(self, position):
"""
Return None -- means that we can't tag it in any way
@@ -36,10 +35,18 @@ Return None -- means that we can't tag it in any way
def wrap(self, tagname, content):
return u"<%s>%s%s>" % (tagname, content, tagname)
+ @staticmethod
+ def anymatches(regex):
+ return lambda x: regex.match(x)
+
class Section(Tagger):
looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
+ def __init__(self, *a):
+ super(Section, self).__init__(*a)
+ self.is_podrozdzial = False
+
def tag(self, pos):
pos2 = self.skip_empty(pos)
pos = pos2
@@ -49,7 +56,8 @@ class Section(Tagger):
return pos + 1
def __unicode__(self):
- return self.wrap("naglowek_rozdzial", self.title)
+ return self.wrap(self.is_podrozdzial and "naglowek_podrozdzial" or "naglowek_rozdzial",
+ self.title)
class Meta(Tagger):
@@ -87,47 +95,68 @@ class Informacje(Tagger):
class List(Tagger):
- def tag(self, pos):
+ point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
+ num = re.compile(r"^[\s]*[a-z][.]\s+(.*)")
+
+ def __init__(self, *args):
+
+ super(List, self).__init__(*args)
self.items = []
+ self.type = 'punkt'
+
+ def tag(self, pos):
while True:
l = self.line(pos)
- if l and l[0] in ('-', '*'):
- self.items.append(l[1:].strip())
+ m = self.point.match(l)
+ if not m:
+ m = self.num.match(l)
+ if m: self.type = 'num'
+ if l and m:
+ self.items.append(m.groups()[0].lstrip())
pos += 1
else:
break
if self.items:
return pos
+ def append(self, tagger):
+ self.items.append(tagger)
+
def __unicode__(self):
- s = "\n"
+ s = '' % self.type
for i in self.items:
- s += "%s\n" % i
- s += "\n"
+ if isinstance(i, list):
+ x = "\n".join(map(lambda elem: unicode(elem), i))
+ else:
+ x = unicode(i)
+ s += "\n%s" % x
+ s += "\n\n"
return s
class Paragraph(Tagger):
remove_this = [
re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
- re.compile(r"^[\s]*$")
+ re.compile(r"^[\s]*$"),
+ re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek")
]
podrozdzial = [
- re.compile(r"[\s]*(przebieg zajÄÄ|opcje dodatkowe)[\s]*", re.I),
+ re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
]
+
def tag(self, pos):
self.line = self.lines[pos]
self.ignore = False
self.is_podrozdzial = False
-
+
for x in self.remove_this:
if x.match(self.line):
self.ignore = True
-
+
for x in self.podrozdzial:
if x.match(self.line):
self.is_podrozdzial = True
-
+
return pos + 1
def __unicode__(self):
@@ -189,7 +218,7 @@ def eatseq(pos, *taggers):
return (tuple(good), pos)
-def tagger(text):
+def tagger(text, pretty_print=False):
"""
tagger(text) function name and signature is a contract.
returns auto-tagged text
@@ -201,7 +230,7 @@ returns auto-tagged text
content = []
state = {}
info = Informacje(state, lines)
-
+
((info,), pos) = eatseq(pos, info)
# print "[i] %d. %s" % (pos, lines[pos])
@@ -220,23 +249,63 @@ returns auto-tagged text
if pos >= len(lines):
break
- return toxml(content)
+ return toxml(content, pretty_print=pretty_print)
dc_fixed = {
'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa PrzyszÅoÅÄ (http://cyfrowaprzyszlosc.pl).',
- 'relation': u'moduÅy powiÄ
zane linki',
+ 'relation': u'moduÅy powiÄ
zane linki',
'description.material': u'linki do zaÅÄ
czników',
'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
}
+class NotFound(Exception):
+ pass
+
+
+def find_block(content, title_re, begin=-1, end=-1):
+ title_re = re.compile(title_re, re.I | re.UNICODE)
+
+ rb = -1
+ if begin < 0: begin = 0
+ if end < 0: end = len(content)
+
+ for i in range(begin, end):
+ elem = content[i]
+ if isinstance(elem, Paragraph):
+ if title_re.match(elem.line):
+ rb = i
+ continue
+ if isinstance(elem, Section):
+ if title_re.match(elem.title):
+ rb = i
+ continue
+ if rb >= 0:
+ if isinstance(elem, List):
+ continue
+ if isinstance(elem, Paragraph) and elem.line:
+ continue
+ break
+ if rb >= 0:
+ return rb, i
+ raise NotFound()
+
+
+def remove_block(content, title_re, removed=None):
+ rb, re = find_block(content, title_re)
+ if removed is not None and isinstance(removed, list):
+ removed += content[rb:re][:]
+ content[rb:re] = []
+ return content
+
+
def mark_activities(content):
i = 0
tl = len(content)
is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
- # import pdb; pdb.set_trace()
+
is_next_section = re.compile(r"^[IVX]+[.]? ")
- is_activity = re.compile(r"^[0-9]+[.]? (.+)")
+ is_activity = re.compile(r"^[0-9]+[.] (.+)")
is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
@@ -253,14 +322,16 @@ def mark_activities(content):
ae = -1
while True:
e = content[i]
+ if isinstance(e, Section):
+ if in_activities and \
+ is_next_section.match(e.title):
+ in_activities = False
+
if isinstance(e, Paragraph):
if not in_activities and \
is_przebieg.match(e.line):
in_activities = True
- if in_activities and \
- is_next_section.match(e.line):
- in_activities = False
if in_activities:
m = is_activity.match(e.line)
if m:
@@ -299,7 +370,8 @@ def mark_dictionary(content):
i = 0
is_dictionary = re.compile(r"[\s]*s.owniczek[\s]*", re.I)
is_dictentry = re.compile(r"([^-]+) - (.+)")
- slowniczek = []
+ slowniczek = content[0].spawn(List)
+ slowniczek.type = 'slowniczek'
while i < len(content):
e = content[i]
if isinstance(e, Section):
@@ -307,13 +379,15 @@ def mark_dictionary(content):
db = i + 1
elif db >= 1:
de = i
- content[db:de] = [Container('slowniczek', *slowniczek)]
+ content[db:de] = [slowniczek]
+ break
elif db >= 0:
if isinstance(e, Paragraph):
m = is_dictentry.match(e.line)
if m:
- slowniczek.append(Container('definiendum', m.groups()[0]))
- slowniczek.append(Container('definiens', m.groups()[1]))
+ slowniczek.append([Container('definiendum', m.groups()[0]),
+ Container('definiens', m.groups()[1])])
+
else:
slowniczek.append(e)
i += 1
@@ -321,9 +395,42 @@ def mark_dictionary(content):
return content
-def toxml(content):
+def move_evaluation(content):
+ evaluation = []
+
+ content = remove_block(content, r"ewaluacja[+ PA\[\].]*", evaluation)
+ if evaluation:
+ # print "found evaluation %s" % (evaluation,)
+ evaluation[0].is_podrozdzial = True
+ # evaluation place
+ opcje_dodatkowe = find_block(content, r"opcje dodatkowe\s*")
+ if opcje_dodatkowe:
+ # print "putting evaluation just before opcje dodatkowe @ %s" % (opcje_dodatkowe, )
+ content[opcje_dodatkowe[0]:opcje_dodatkowe[0]] = evaluation
+ else:
+ materialy = find_block(content, r"materia.y[+ AP\[\].]*")
+ if materialy:
+ # print "putting evaluation just before materialy @ %s" % (materialy, )
+ content[materialy[0]:materialy[0]] = evaluation
+ else:
+ print "er.. no idea where to place evaluation"
+ return content
+
+
+def toxml(content, pretty_print=False):
+ # some transformations
content = mark_activities(content)
content = mark_dictionary(content)
+ try:
+ content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
+ except NotFound:
+ pass
+ try:
+ content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+ except NotFound:
+ pass
+ content = move_evaluation(content)
+
info = content.pop(0)
state = info.state
@@ -379,9 +486,9 @@ def toxml(content):
p(u'')
t(u'nazwa_utworu', meta.get(u'TytuÅ moduÅu', u''))
- p(u'')
- a(u'Numer porzÄ
dkowy: %s' % meta.get(u'Numer porzÄ
dkowy', u''))
- p(u'')
+ # p(u'')
+ a(u'' % meta.get(u'Numer porzÄ
dkowy', u''))
+ # p(u'')
p(unicode(info.title))
for elm in content:
@@ -393,6 +500,12 @@ def toxml(content):
p(u'')
p(u'')
+ if pretty_print:
+ from lxml import etree
+ from StringIO import StringIO
+ xml = etree.parse(StringIO(holder['xml']))
+ holder['xml'] = etree.tostring(xml, pretty_print=pretty_print, encoding=unicode)
+
return holder['xml']