some epub/mobi fixes

[librarian.git] / librarian / parser.py
diff --git a/librarian/parser.py b/librarian/parser.py

index 341eaf8..afc4f1a 100644 (file)
--- a/librarian/parser.py
+++ b/librarian/parser.py
@@ -42,7 +42,7 @@ class WLDocument(object):
          return cls.from_file(StringIO(xml), *args, **kwargs)
  
      @classmethod
-    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True, preserve_lines=True):
+    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
  
          # first, prepare for parsing
          if isinstance(xmlfile, basestring):
@@ -59,18 +59,38 @@ class WLDocument(object):
  
          data = data.replace(u'\ufeff', '')
  
-        if swap_endlines:
-            sub = u'<br/>'
-            if preserve_lines:
-                sub += u'\n'
-            data = cls.LINE_SWAP_EXPR.sub(sub, data)
-
          try:
              parser = etree.XMLParser(remove_blank_text=False)
-            return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
+            tree = etree.parse(StringIO(data.encode('utf-8')), parser)
+
+            if swap_endlines:
+                cls.swap_endlines(tree)
+
+            return cls(tree, parse_dublincore=parse_dublincore)
          except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
              raise ParseError(e)
  
+    @classmethod
+    def swap_endlines(cls, tree):
+        # Replace line-break markers with <br> elements, in place; only swap inside stanzas ('strofa' elements).
+        for elem in tree.iter('strofa'):
+            for child in list(elem):  # iterate over a copy: <br> elements are inserted into elem below
+                if child.tail:
+                    chunks = cls.LINE_SWAP_EXPR.split(child.tail)  # LINE_SWAP_EXPR is defined outside this hunk; presumably a compiled regex
+                    ins_index = elem.index(child) + 1  # position right after the current child
+                    while len(chunks) > 1:  # take pieces from the end; inserting at a fixed index keeps their order
+                        ins = etree.Element('br')
+                        ins.tail = chunks.pop()  # each piece after the first follows its own <br>
+                        elem.insert(ins_index, ins)
+                    child.tail = chunks.pop(0)  # the first piece stays as the child's tail
+            if elem.text:  # text before the first child gets the same treatment
+                chunks = cls.LINE_SWAP_EXPR.split(elem.text)
+                while len(chunks) > 1:  # same reverse-pop scheme, inserting at the front of elem
+                    ins = etree.Element('br')
+                    ins.tail = chunks.pop()
+                    elem.insert(0, ins)
+                elem.text = chunks.pop(0)  # the first piece stays as the element's leading text
+
      def chunk(self, path):
          # convert the path to XPath
          expr = self.path_to_xpath(path)