suppress xmlns:... attributes in epub
[librarian.git] / librarian / parser.py
index 4cdaa79..502192f 100644 (file)
@@ -3,22 +3,28 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from librarian import ValidationError, NoDublinCore,  ParseError
+from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
 from librarian import RDFNS
 from librarian import RDFNS
+from librarian.cover import DefaultEbookCover
 from librarian import dcparser
 
 from xml.parsers.expat import ExpatError
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
 from librarian import dcparser
 
 from xml.parsers.expat import ExpatError
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
+import os
 import re
 from StringIO import StringIO
 
 import re
 from StringIO import StringIO
 
+
 class WLDocument(object):
 class WLDocument(object):
-    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
+    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
+    provider = None
 
 
-    def __init__(self, edoc, parse_dublincore=True):
+    def __init__(self, edoc, parse_dublincore=True, provider=None, 
+                 strict=False, meta_fallbacks=None):
         self.edoc = edoc
         self.edoc = edoc
+        self.provider = provider
 
         root_elem = edoc.getroot()
 
 
         root_elem = edoc.getroot()
 
@@ -33,7 +39,8 @@ class WLDocument(object):
             if self.rdf_elem is None:
                 raise NoDublinCore('Document has no DublinCore - which is required.')
 
             if self.rdf_elem is None:
                 raise NoDublinCore('Document has no DublinCore - which is required.')
 
-            self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
+            self.book_info = dcparser.BookInfo.from_element(
+                    self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
         else:
             self.book_info = None
 
         else:
             self.book_info = None
 
@@ -42,7 +49,7 @@ class WLDocument(object):
         return cls.from_file(StringIO(xml), *args, **kwargs)
 
     @classmethod
         return cls.from_file(StringIO(xml), *args, **kwargs)
 
     @classmethod
-    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
+    def from_file(cls, xmlfile, *args, **kwargs):
 
         # first, prepare for parsing
         if isinstance(xmlfile, basestring):
 
         # first, prepare for parsing
         if isinstance(xmlfile, basestring):
@@ -61,22 +68,19 @@ class WLDocument(object):
 
         try:
             parser = etree.XMLParser(remove_blank_text=False)
 
         try:
             parser = etree.XMLParser(remove_blank_text=False)
-            tree = etree.parse(StringIO(data), parser)
-
-            if swap_endlines:
-                cls.swap_endlines(tree)
+            tree = etree.parse(StringIO(data.encode('utf-8')), parser)
 
 
-            return cls(tree, parse_dublincore=parse_dublincore)
+            return cls(tree, *args, **kwargs)
         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
             raise ParseError(e)
 
         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
             raise ParseError(e)
 
-    @classmethod
-    def swap_endlines(cls, tree):
+    def swap_endlines(self):
+        """Converts line breaks in stanzas into <br/> tags."""
         # only swap inside stanzas
         # only swap inside stanzas
-        for elem in tree.iter('strofa'):
+        for elem in self.edoc.iter('strofa'):
             for child in list(elem):
                 if child.tail:
             for child in list(elem):
                 if child.tail:
-                    chunks = cls.LINE_SWAP_EXPR.split(child.tail)
+                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                     ins_index = elem.index(child) + 1
                     while len(chunks) > 1:
                         ins = etree.Element('br')
                     ins_index = elem.index(child) + 1
                     while len(chunks) > 1:
                         ins = etree.Element('br')
@@ -84,13 +88,21 @@ class WLDocument(object):
                         elem.insert(ins_index, ins)
                     child.tail = chunks.pop(0)
             if elem.text:
                         elem.insert(ins_index, ins)
                     child.tail = chunks.pop(0)
             if elem.text:
-                chunks = cls.LINE_SWAP_EXPR.split(elem.text)
+                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                 while len(chunks) > 1:
                     ins = etree.Element('br')
                     ins.tail = chunks.pop()
                     elem.insert(0, ins)
                 elem.text = chunks.pop(0)
 
                 while len(chunks) > 1:
                     ins = etree.Element('br')
                     ins.tail = chunks.pop()
                     elem.insert(0, ins)
                 elem.text = chunks.pop(0)
 
+    def parts(self):
+        if self.provider is None:
+            raise NoProvider('No document provider supplied.')
+        if self.book_info is None:
+            raise NoDublinCore('No Dublin Core in document.')
+        for part_uri in self.book_info.parts:
+            yield self.from_file(self.provider.by_uri(part_uri), provider=self.provider)
+
     def chunk(self, path):
         # convert the path to XPath
         expr = self.path_to_xpath(path)
     def chunk(self, path):
         # convert the path to XPath
         expr = self.path_to_xpath(path)
@@ -110,7 +122,7 @@ class WLDocument(object):
                 parts.append(part)
             else:
                 tag, n = match.groups()
                 parts.append(part)
             else:
                 tag, n = match.groups()
-                parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
+                parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
 
         if parts[0] == '.':
             parts[0] = ''
 
         if parts[0] == '.':
             parts[0] = ''
@@ -123,7 +135,7 @@ class WLDocument(object):
     def update_dc(self):
         if self.book_info:
             parent = self.rdf_elem.getparent()
     def update_dc(self):
         if self.book_info:
             parent = self.rdf_elem.getparent()
-            parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
+            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
 
     def serialize(self):
         self.update_dc()
 
     def serialize(self):
         self.update_dc()
@@ -136,19 +148,77 @@ class WLDocument(object):
             try:
                 xpath = self.path_to_xpath(key)
                 node = self.edoc.xpath(xpath)[0]
             try:
                 xpath = self.path_to_xpath(key)
                 node = self.edoc.xpath(xpath)[0]
-                repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
-                node.getparent().replace(node, repl);
+                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
+                node.getparent().replace(node, repl)
             except Exception, e:
             except Exception, e:
-                unmerged.append( repr( (key, xpath, e) ) )
+                unmerged.append(repr((key, xpath, e)))
 
         return unmerged
 
 
         return unmerged
 
-    def clean_ed_note(self):
+    def clean_ed_note(self, note_tag='nota_red'):
         """ deletes forbidden tags from nota_red """
 
         """ deletes forbidden tags from nota_red """
 
-        for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
-                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
+        for node in self.edoc.xpath('|'.join('//%s//%s' % (note_tag, tag) for tag in
+                                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
             tail = node.tail
             node.clear()
             node.tag = 'span'
             node.tail = tail
             tail = node.tail
             node.clear()
             node.tag = 'span'
             node.tail = tail
+
+    def editors(self):
+        """Returns a set of all editors for book and its children.
+
+        :returns: set of dcparser.Person objects
+        """
+        if self.book_info is None:
+            raise NoDublinCore('No Dublin Core in document.')
+        persons = set(self.book_info.editors + self.book_info.technical_editors)
+        for child in self.parts():
+            persons.update(child.editors())
+        if None in persons:
+            persons.remove(None)
+        return persons
+
+    # Converters
+
+    def as_html(self, *args, **kwargs):
+        from librarian import html
+        return html.transform(self, *args, **kwargs)
+
+    def as_text(self, *args, **kwargs):
+        from librarian import text
+        return text.transform(self, *args, **kwargs)
+
+    def as_epub(self, *args, **kwargs):
+        from librarian import epub
+        return epub.transform(self, *args, **kwargs)
+
+    def as_pdf(self, *args, **kwargs):
+        from librarian import pdf
+        return pdf.transform(self, *args, **kwargs)
+
+    def as_mobi(self, *args, **kwargs):
+        from librarian import mobi
+        return mobi.transform(self, *args, **kwargs)
+
+    def as_fb2(self, *args, **kwargs):
+        from librarian import fb2
+        return fb2.transform(self, *args, **kwargs)
+
+    def as_cover(self, cover_class=None, *args, **kwargs):
+        if cover_class is None:
+            cover_class = DefaultEbookCover
+        return cover_class(self.book_info, *args, **kwargs).output_file()
+
+    def save_output_file(self, output_file, output_path=None, output_dir_path=None, make_author_dir=False, ext=None):
+        if output_dir_path:
+            save_path = output_dir_path
+            if make_author_dir:
+                save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8'))
+            save_path = os.path.join(save_path, self.book_info.uri.slug)
+            if ext:
+                save_path += '.%s' % ext
+        else:
+            save_path = output_path
+
+        output_file.save_as(save_path)