X-Git-Url: https://git.mdrn.pl/redakcja.git/blobdiff_plain/02500a11ed4bd76a6fc32f3e8676365eb344f771..6e8379333178e150cb4783e99342e55abc576589:/apps/catalogue/management/edumed.py diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py index 76d39649..25f30849 100644 --- a/apps/catalogue/management/edumed.py +++ b/apps/catalogue/management/edumed.py @@ -61,7 +61,7 @@ class Section(Tagger): class Meta(Tagger): - looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE) + looks_like = re.compile(r"([^:]+): ?(.*)", re.UNICODE) def tag(self, pos): pos = self.skip_empty(pos) @@ -96,7 +96,7 @@ class Informacje(Tagger): class List(Tagger): point = re.compile(r"^[\s]*[-*·]{1,2}(.*)") - num = re.compile(r"^[\s]*[a-z]{1,2}[.]\s+(.*)") + num = re.compile(r"^[\s]*[a-z][.]\s+(.*)") def __init__(self, *args): @@ -137,7 +137,8 @@ class List(Tagger): class Paragraph(Tagger): remove_this = [ re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I), - re.compile(r"^[\s]*$") + re.compile(r"^[\s]*$"), + re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek") ] podrozdzial = [ re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I), @@ -251,7 +252,7 @@ returns auto-tagged text return toxml(content, pretty_print=pretty_print) dc_fixed = { - 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).', + 'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://edukacjamedialna.edu.pl).', 'relation': u'moduły powiązane linki', 'description.material': u'linki do załączników', 'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0', @@ -264,9 +265,6 @@ class NotFound(Exception): def find_block(content, title_re, begin=-1, end=-1): title_re = re.compile(title_re, re.I | re.UNICODE) - ## print "looking for %s" % title_re.pattern - if title_re.pattern[0:6] == 'pomoce': - import pdb; pdb.set_trace() rb = -1 if begin < 0: begin = 0 @@ -307,7 +305,7 @@ def mark_activities(content): is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I) is_next_section = re.compile(r"^[IVX]+[.]? ") - is_activity = re.compile(r"^[0-9]+[.]? (.+)") + is_activity = re.compile(r"^[0-9]+[.] (.+)") is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)") is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)") @@ -324,14 +322,16 @@ def mark_activities(content): ae = -1 while True: e = content[i] + if isinstance(e, Section): + if in_activities and \ + is_next_section.match(e.title): + in_activities = False + if isinstance(e, Paragraph): if not in_activities and \ is_przebieg.match(e.line): in_activities = True - if in_activities and \ - is_next_section.match(e.line): - in_activities = False if in_activities: m = is_activity.match(e.line) if m: @@ -389,12 +389,38 @@ def mark_dictionary(content): Container('definiens', m.groups()[1])]) else: - slowniczek.append(e) + slowniczek.append(e.line) i += 1 return content +def mark_czytelnia(content): + db = -1 + de = -1 + i = 0 + czy_czytelnia = re.compile(r"[\s]*czytelnia[\s]*", re.I) + czytelnia = content[0].spawn(List) + czytelnia.type = 'czytelnia' + while i < len(content): + e = content[i] + if isinstance(e, Section): + if czy_czytelnia.match(e.title): + db = i + 1 + elif db >= 1: + de = i + content[db:de] = [czytelnia] + break + elif db >= 0: + if isinstance(e, Paragraph): + if e.line: + czytelnia.append(e.line) + i += 1 + + return content + + + def move_evaluation(content): evaluation = [] @@ -421,6 +447,8 @@ def toxml(content, pretty_print=False): # some transformations content = mark_activities(content) content = mark_dictionary(content) + content = mark_czytelnia(content) + try: content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*") except NotFound: @@ -454,7 +482,7 @@ def toxml(content, pretty_print=False): p("") p(u'') - p(u'') + p(u'') authors = map(unicode.strip, meta[u'Autorzy'].split(u',')) for author in authors: names = author.split(u' ') @@ -473,7 +501,7 @@ def toxml(content, pretty_print=False): dc(u'description', dc_fixed['description']) dc(u'description.material', dc_fixed['description.material']) dc(u'relation', dc_fixed['relation']) - dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug) + dc(u'identifier.url', u'http://edukacjamedialna.edu.pl/%s' % slug) dc(u'rights', dc_fixed['rights']) dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/') dc(u'format', u'xml')