master xml making script

[redakcja.git] / apps / catalogue / management / edumed.py
diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py

index b1e67c3..25f3084 100644 (file)
--- a/apps/catalogue/management/edumed.py
+++ b/apps/catalogue/management/edumed.py
@@ -61,7 +61,7 @@ class Section(Tagger):
  
  
  class Meta(Tagger):
-    looks_like = re.compile(r"([^:]+): (.*)", re.UNICODE)
+    looks_like = re.compile(r"([^:]+): ?(.*)", re.UNICODE)
  
      def tag(self, pos):
          pos = self.skip_empty(pos)
@@ -96,7 +96,7 @@ class Informacje(Tagger):
  
  class List(Tagger):
      point = re.compile(r"^[\s]*[-*·]{1,2}(.*)")
-    num = re.compile(r"^[\s]*[a-z]{1,2}[.]\s+(.*)")
+    num = re.compile(r"^[\s]*[a-z][.]\s+(.*)")
  
      def __init__(self, *args):
  
@@ -137,7 +137,8 @@ class List(Tagger):
  class Paragraph(Tagger):
      remove_this = [
          re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
-        re.compile(r"^[\s]*$")
+        re.compile(r"^[\s]*$"),
+        re.compile(r"http://pad.nowoczesnapolska.org.pl/p/slowniczek")
          ]
      podrozdzial = [
          re.compile(r"[\s]*(przebieg zaj..|opcje dodatkowe)[\s]*", re.I),
@@ -251,18 +252,19 @@ returns auto-tagged text
      return toxml(content, pretty_print=pretty_print)
  
  dc_fixed = {
-    'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://cyfrowaprzyszlosc.pl).',
+    'description': u'Publikacja zrealizowana w ramach projektu Cyfrowa Przyszłość (http://edukacjamedialna.edu.pl).',
      'relation': u'moduły powiązane linki',
      'description.material': u'linki do załączników',
      'rights': u'Creative Commons Uznanie autorstwa - Na tych samych warunkach 3.0',
      }
  
  
+class NotFound(Exception):
+    pass
+
+
  def find_block(content, title_re, begin=-1, end=-1):
      title_re = re.compile(title_re, re.I | re.UNICODE)
-    ##   print "looking for %s" % title_re.pattern
-    if title_re.pattern[0:6] == 'pomoce':
-        import pdb; pdb.set_trace()
  
      rb = -1
      if begin < 0: begin = 0
@@ -286,11 +288,11 @@ def find_block(content, title_re, begin=-1, end=-1):
              break
      if rb >= 0:
          return rb, i
+    raise NotFound()
  
  
  def remove_block(content, title_re, removed=None):
      rb, re = find_block(content, title_re)
-
      if removed is not None and isinstance(removed, list):
          removed += content[rb:re][:]
      content[rb:re] = []
@@ -303,7 +305,7 @@ def mark_activities(content):
      is_przebieg = re.compile(r"[\s]*przebieg zaj..[\s]*", re.I)
  
      is_next_section = re.compile(r"^[IVX]+[.]? ")
-    is_activity = re.compile(r"^[0-9]+[.]? (.+)")
+    is_activity = re.compile(r"^[0-9]+[.] (.+)")
  
      is_activity_tools = re.compile(r"^pomoce:[\s]*(.+)")
      is_activity_work = re.compile(r"^forma pracy:[\s]*(.+)")
@@ -320,14 +322,16 @@ def mark_activities(content):
      ae = -1
      while True:
          e = content[i]
+        if isinstance(e, Section):
+            if in_activities and \
+                is_next_section.match(e.title):
+                in_activities = False
+            
          if isinstance(e, Paragraph):
              if not in_activities and \
                  is_przebieg.match(e.line):
                  in_activities = True
  
-            if in_activities and \
-                is_next_section.match(e.line):
-                in_activities = False
              if in_activities:
                  m = is_activity.match(e.line)
                  if m:
@@ -385,12 +389,38 @@ def mark_dictionary(content):
                                         Container('definiens', m.groups()[1])])
  
                  else:
-                    slowniczek.append(e)
+                    slowniczek.append(e.line)
+        i += 1
+
+    return content
+
+
+def mark_czytelnia(content):
+    db = -1
+    de = -1
+    i = 0
+    czy_czytelnia = re.compile(r"[\s]*czytelnia[\s]*", re.I)
+    czytelnia = content[0].spawn(List)
+    czytelnia.type = 'czytelnia'
+    while i < len(content):
+        e = content[i]
+        if isinstance(e, Section):
+            if czy_czytelnia.match(e.title):
+                db = i + 1
+            elif db >= 1:
+                de = i
+                content[db:de] = [czytelnia]
+                break
+        elif db >= 0:
+            if isinstance(e, Paragraph):
+                if e.line:
+                    czytelnia.append(e.line)
          i += 1
  
      return content
  
  
+
  def move_evaluation(content):
      evaluation = []
  
@@ -417,8 +447,16 @@ def toxml(content, pretty_print=False):
      # some transformations
      content = mark_activities(content)
      content = mark_dictionary(content)
-    content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
-    content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+    content = mark_czytelnia(content)
+    
+    try:
+        content = remove_block(content, r"wykorzyst(yw)?ane metody[+ PA\[\].]*")
+    except NotFound:
+        pass
+    try:
+        content = remove_block(content, r"(pomoce|potrzebne materia.y)[+ PA\[\]]*")
+    except NotFound:
+        pass
      content = move_evaluation(content)
  
      info = content.pop(0)
@@ -444,7 +482,7 @@ def toxml(content, pretty_print=False):
  
      p("<utwor>")
      p(u'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">')
-    p(u'<rdf:Description rdf:about="http://redakcja.cyfrowaprzyszlosc.pl/documents/">')
+    p(u'<rdf:Description rdf:about="http://redakcja.edukacjamedialna.edu.pl/documents/">')
      authors = map(unicode.strip, meta[u'Autorzy'].split(u','))
      for author in authors:
          names = author.split(u' ')
@@ -463,7 +501,7 @@ def toxml(content, pretty_print=False):
      dc(u'description', dc_fixed['description'])
      dc(u'description.material', dc_fixed['description.material'])
      dc(u'relation', dc_fixed['relation'])
-    dc(u'identifier.url', u'http://cyfrowaprzyszlosc.pl/%s' % slug)
+    dc(u'identifier.url', u'http://edukacjamedialna.edu.pl/%s' % slug)
      dc(u'rights', dc_fixed['rights'])
      dc(u'rights.license', u'http://creativecommons.org/licenses/by-sa/3.0/')
      dc(u'format', u'xml')