swap endlines inside stanzas only
author Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Tue, 4 Jan 2011 11:01:01 +0000 (12:01 +0100)
committer Radek Czajka <radoslaw.czajka@nowoczesnapolska.org.pl>
Tue, 4 Jan 2011 11:01:01 +0000 (12:01 +0100)
librarian/parser.py
librarian/pdf.py

index 341eaf8..4cdaa79 100644 (file)
@@ -42,7 +42,7 @@ class WLDocument(object):
         return cls.from_file(StringIO(xml), *args, **kwargs)
 
     @classmethod
-    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True, preserve_lines=True):
+    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
 
         # first, prepare for parsing
         if isinstance(xmlfile, basestring):
@@ -59,18 +59,38 @@ class WLDocument(object):
 
         data = data.replace(u'\ufeff', '')
 
-        if swap_endlines:
-            sub = u'<br/>'
-            if preserve_lines:
-                sub += u'\n'
-            data = cls.LINE_SWAP_EXPR.sub(sub, data)
-
         try:
             parser = etree.XMLParser(remove_blank_text=False)
-            return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
+            tree = etree.parse(StringIO(data), parser)
+
+            if swap_endlines:
+                cls.swap_endlines(tree)
+
+            return cls(tree, parse_dublincore=parse_dublincore)
         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
             raise ParseError(e)
 
+    @classmethod
+    def swap_endlines(cls, tree):
+        # Only swap inside stanzas: in every 'strofa' element, each LINE_SWAP_EXPR match in the element's own text or in a direct child's tail becomes a 'br' element (text nested inside children is not touched).
+        for elem in tree.iter('strofa'):
+            for child in list(elem):  # iterate over a copy, because 'br' elements are inserted into elem below
+                if child.tail:
+                    chunks = cls.LINE_SWAP_EXPR.split(child.tail)
+                    ins_index = elem.index(child) + 1  # slot directly after child
+                    while len(chunks) > 1:
+                        ins = etree.Element('br')
+                        ins.tail = chunks.pop()  # take the last chunk first; inserting at a fixed index pushes earlier inserts back, so the final order matches the text
+                        elem.insert(ins_index, ins)
+                    child.tail = chunks.pop(0)  # the first chunk remains the child's tail
+            if elem.text:  # same procedure for the text that precedes the first child
+                chunks = cls.LINE_SWAP_EXPR.split(elem.text)
+                while len(chunks) > 1:
+                    ins = etree.Element('br')
+                    ins.tail = chunks.pop()  # last chunk first, as above
+                    elem.insert(0, ins)  # index 0 places the 'br' before all existing children
+                elem.text = chunks.pop(0)  # the first chunk remains the element's leading text
+
     def chunk(self, path):
         # convert the path to XPath
         expr = self.path_to_xpath(path)
index af68a1b..2d45372 100644 (file)
@@ -283,8 +283,7 @@ def load_including_children(provider, slug=None, uri=None, file_path=None):
         raise ValueError('Neither slug, URI nor file path provided for a book.')
 
     document = WLDocument.from_file(f, True,
-        parse_dublincore=True,
-        preserve_lines=False)
+        parse_dublincore=True)
 
     f.close()