Fix dictionary auto-tagging

[redakcja.git] / apps / catalogue / management / edumed.py
diff --git a/apps/catalogue/management/edumed.py b/apps/catalogue/management/edumed.py

index e5d5ee0..250eccb 100644 (file)
--- a/apps/catalogue/management/edumed.py
+++ b/apps/catalogue/management/edumed.py
@@ -15,11 +15,14 @@ class Tagger:
      def line(self, position):
          return self.lines[position]
  
-    empty_line = re.compile(r"\s+")
+    ignore = [re.compile(r"^[\[][PA][\]] - [^ ]+$")]
+    empty_line = re.compile(r"^\s+$")
      
      def skip_empty(self, position):
          while self.line(position) == "" or \
-            self.empty_line.match(self.line(position)):
+            self.empty_line.match(self.line(position)) or \
+            filter(lambda r: r.match(self.line(position)),
+                             self.ignore[:]):
              position += 1
          return position
  
@@ -32,6 +35,11 @@ Return None -- means that we can't tag it in any way
      def wrap(self, tagname, content):
          return u"<%s>%s</%s>" % (tagname, content, tagname)
  
+    @staticmethod
+    def anymatches(regex):
+        return lambda x: regex.match(x)
+        
+
  
  class Section(Tagger):
      looks_like = re.compile(r"^[IVX]+[.]\s+(.*)$")
@@ -83,11 +91,14 @@ class Informacje(Tagger):
  
  
  class List(Tagger):
+    point = re.compile(r"^[\s]*([-*])")
+    
      def tag(self, pos):
          self.items = []
          while True:
              l = self.line(pos)
-            if l and l[0] in ('-', '*'):
+            m = self.point.match(l)
+            if l and m:
                  self.items.append(l[1:].strip())
                  pos += 1
              else:
@@ -96,16 +107,16 @@ class List(Tagger):
              return pos
  
      def __unicode__(self):
-        s = "<lista>\n"
+        s = '<lista typ="punkt">'
          for i in self.items:
-            s += "<punkt>%s</punkt>\n" % i
-        s += "</lista>\n"
+            s += "\n<punkt>%s</punkt>" % i
+        s += "\n</lista>\n"
          return s
  
  
  class Paragraph(Tagger):
      remove_this = [
-        re.compile(r"[\s]*opis zawartości[\s]*", re.I),
+        re.compile(r"[\s]*opis zawarto.ci[\s]*", re.I),
          re.compile(r"^[\s]*$")
          ]
      podrozdzial = [
@@ -215,7 +226,7 @@ returns auto-tagged text
              pos += 1
              if pos >= len(lines):
                  break
-
+            
      return toxml(content)
  
  dc_fixed = {
@@ -304,6 +315,7 @@ def mark_dictionary(content):
              elif db >= 1:
                  de = i
                  content[db:de] = [Container('slowniczek', *slowniczek)]
+                break
          elif db >= 0:
              if isinstance(e, Paragraph):
                  m = is_dictentry.match(e.line)