slowniczek
[librarian.git] / librarian / parser.py
index 225000b..f02b64c 100644 (file)
 # -*- coding: utf-8 -*-
 #
 # -*- coding: utf-8 -*-
 #
-#    This file is part of Librarian.
+# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 #
-#    Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska <fundacja@nowoczesnapolska.org.pl>
-#    
-#    For full list of contributors see AUTHORS file. 
-#
-#    This program is free software: you can redistribute it and/or modify
-#    it under the terms of the GNU Affero General Public License as published by
-#    the Free Software Foundation, either version 3 of the License, or
-#    (at your option) any later version.
-#
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU Affero General Public License for more details.
-#
-#    You should have received a copy of the GNU Affero General Public License
-#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-from librarian import ValidationError, NoDublinCore,  ParseError
-from librarian import RDFNS
+from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
+from librarian import RDFNS, IOFile
 from librarian import dcparser
 
 from xml.parsers.expat import ExpatError
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
 from librarian import dcparser
 
 from xml.parsers.expat import ExpatError
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
+import os
 import re
 from StringIO import StringIO
 
 class WLDocument(object):
 import re
 from StringIO import StringIO
 
 class WLDocument(object):
-    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
-
-    def __init__(self, edoc, parse_dublincore=True):
-        self.edoc = edoc
+    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
+    provider = None
+
+    _edoc = None
+    @property
+    def edoc(self):
+        """Return the parsed XML tree of the source, parsing on first access.
+
+        The text from ``self.source.get_string()`` is decoded as UTF-8 when it
+        is not already unicode, every U+FEFF character is removed, and the
+        result is parsed by lxml with blank text preserved. The tree is cached
+        in ``self._edoc`` and returned as-is on later accesses.
+
+        :raises ParseError: wrapping an ExpatError, XMLSyntaxError or
+            XSLTApplyError raised while parsing
+        """
+        if self._edoc is not None:
+            return self._edoc
+        raw = self.source.get_string()
+        text = raw if isinstance(raw, unicode) else raw.decode('utf-8')
+        # U+FEFF is dropped wherever it occurs in the text
+        text = text.replace(u'\ufeff', '')
+        try:
+            xml_parser = etree.XMLParser(remove_blank_text=False)
+            # the parser is fed UTF-8 bytes, not the unicode object
+            self._edoc = etree.parse(StringIO(text.encode('utf-8')), xml_parser)
+        except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
+            raise ParseError(e)
+        return self._edoc
+
+    _rdf_elem = None
+    @property
+    def rdf_elem(self):
+        """Return the RDF element of the document, looked up on first access.
+
+        The element is searched for anywhere below the root using the
+        ``RDFNS('RDF')`` tag and cached in ``self._rdf_elem``.
+
+        :raises NoDublinCore: when the document contains no such element
+        """
+        if self._rdf_elem is not None:
+            return self._rdf_elem
+        rdf_path = './/' + RDFNS('RDF')
+        self._rdf_elem = self.edoc.getroot().find(rdf_path)
+        if self._rdf_elem is None:
+            raise NoDublinCore('Document has no DublinCore - which is required.')
+        return self._rdf_elem
 
 
-        root_elem = edoc.getroot()
-       
-        dc_path = './/' + RDFNS('RDF')
-        
-        if root_elem.tag != 'utwor':
+    _book_info = None
+    @property
+    def book_info(self):
+        """Return the document's BookInfo, built on first access.
+
+        Returns None whenever ``self.parse_dublincore`` is false. Otherwise
+        the value is created by ``dcparser.BookInfo.from_element`` from
+        ``self.rdf_elem`` (with ``self.meta_fallbacks`` and ``self.strict``)
+        and cached in ``self._book_info``.
+        """
+        if self.parse_dublincore:
+            if self._book_info is None:
+                self._book_info = dcparser.BookInfo.from_element(
+                        self.rdf_elem,
+                        fallbacks=self.meta_fallbacks,
+                        strict=self.strict)
+            return self._book_info
+        return None
+
+    def __init__(self, iofile, provider=None,
+            parse_dublincore=True, # shouldn't it be in a subclass?
+            strict=False, # ?
+            meta_fallbacks=None # ?
+            ):
+        """Store the source and options, then validate the document root.
+
+        :param iofile: source object; its ``get_string()`` supplies the XML
+        :param provider: object whose ``by_uri()`` is used to load parts
+        :param parse_dublincore: when true, book_info is built right away
+        :param strict: stored as ``self.strict`` for BookInfo parsing
+        :param meta_fallbacks: stored as ``self.meta_fallbacks`` for
+            BookInfo parsing
+        :raises ValidationError: when the root element is not ``utwor``
+        """
+        self.source = iofile
+        self.provider = provider
+        self.parse_dublincore = parse_dublincore
+        self.strict = strict
+        self.meta_fallbacks = meta_fallbacks
+        # The error message below formats root_elem.tag, so the root must be
+        # bound to that name here; otherwise an invalid root raises NameError.
+        root_elem = self.edoc.getroot()
+        if root_elem.tag != 'utwor':
             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
-
         if parse_dublincore:
         if parse_dublincore:
-            self.rdf_elem = root_elem.find(dc_path)
+            # accessed only for its side effect: builds and caches book_info
+            self.book_info
 
 
-            if self.rdf_elem is None:
-                raise NoDublinCore('Document has no DublinCore - which is required.')
-            
-            self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
-        else:
-            self.book_info = None
-    
     @classmethod
     @classmethod
-    def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
-        return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)
+    def from_string(cls, xml, *args, **kwargs):
+        """Build a document from XML text wrapped by ``IOFile.from_string``.
+
+        Extra positional and keyword arguments go to the constructor.
+        """
+        return cls(IOFile.from_string(xml), *args, **kwargs)
 
     @classmethod
 
     @classmethod
-    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
-
-        # first, prepare for parsing
+    def from_file(cls, xmlfile, *args, **kwargs):
+        """Build a document from a filename or from a file object.
+
+        A ``basestring`` argument is handed to ``IOFile.from_filename``;
+        anything else is handed to ``IOFile.from_file``. Extra positional
+        and keyword arguments go to the constructor.
+        """
         if isinstance(xmlfile, basestring):
         if isinstance(xmlfile, basestring):
-            file = open(xmlfile, 'rb')
-            try:
-                data = file.read()
-            finally:
-                file.close()
+            iofile = IOFile.from_filename(xmlfile)
         else:
         else:
-            data = xmlfile.read()
-
-        if not isinstance(data, unicode):
-            data = data.decode('utf-8')
-
-        if swap_endlines:
-            data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)
-    
-        try:
-            parser = etree.XMLParser(remove_blank_text=False)
-            return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
-        except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
-            raise ParseError(e)                  
+            iofile = IOFile.from_file(xmlfile)
+        return cls(iofile, *args, **kwargs)
+
+
+    def swap_endlines(self):
+        """Replace ``/``-plus-whitespace line ends in stanzas with <br/> tags.
+
+        Only ``strofa`` elements are touched, and within each only its own
+        text and the tails of its direct children. Each piece of text after
+        a LINE_SWAP_EXPR match becomes the tail of a newly inserted ``br``.
+        """
+        split_lines = self.LINE_SWAP_EXPR.split
+        for stanza in self.edoc.iter('strofa'):
+            # iterate over a snapshot: the new <br/> elements must not be visited
+            for node in list(stanza):
+                if not node.tail:
+                    continue
+                pieces = split_lines(node.tail)
+                position = stanza.index(node) + 1
+                # inserting last-to-first at a fixed index keeps document order
+                for piece in reversed(pieces[1:]):
+                    br = etree.Element('br')
+                    br.tail = piece
+                    stanza.insert(position, br)
+                node.tail = pieces[0]
+            if stanza.text:
+                pieces = split_lines(stanza.text)
+                for piece in reversed(pieces[1:]):
+                    br = etree.Element('br')
+                    br.tail = piece
+                    stanza.insert(0, br)
+                stanza.text = pieces[0]
+
+    def parts(self):
+        """Yield a document for every part URI listed in book_info.
+
+        Each part is loaded with ``from_file`` from whatever
+        ``self.provider.by_uri(uri)`` returns, and receives the same provider.
+        As this is a generator, the checks below run on first iteration.
+
+        :raises NoDublinCore: when book_info is None
+        :raises NoProvider: when there are parts but no provider is set
+        """
+        info = self.book_info
+        if info is None:
+            raise NoDublinCore('No Dublin Core in document.')
+        if self.provider is None and info.parts:
+            raise NoProvider('No document provider supplied.')
+        for uri in info.parts:
+            part_file = self.provider.by_uri(uri)
+            yield self.from_file(part_file, provider=self.provider)
 
     def chunk(self, path):
 
     def chunk(self, path):
+        """Return the first element matching ``path``, or None if none does.
+
+        ``path`` is translated by ``self.path_to_xpath`` and the resulting
+        expression is evaluated against ``self.edoc``.
+        """
-        # convert the path to XPath        
+        # convert the path to XPath
         expr = self.path_to_xpath(path)
         elems = self.edoc.xpath(expr)
 
         if len(elems) == 0:
             return None
         expr = self.path_to_xpath(path)
         elems = self.edoc.xpath(expr)
 
         if len(elems) == 0:
             return None
-        else:        
+        else:
             return elems[0]
 
     def path_to_xpath(self, path):
             return elems[0]
 
     def path_to_xpath(self, path):
@@ -128,8 +158,81 @@ class WLDocument(object):
                 xpath = self.path_to_xpath(key)
                 node = self.edoc.xpath(xpath)[0]
                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
                 xpath = self.path_to_xpath(key)
                 node = self.edoc.xpath(xpath)[0]
                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
-                node.getparent().replace(node, repl);
+                node.getparent().replace(node, repl)
             except Exception, e:
                 unmerged.append( repr( (key, xpath, e) ) )
 
             except Exception, e:
                 unmerged.append( repr( (key, xpath, e) ) )
 
-        return unmerged
\ No newline at end of file
+        return unmerged
+
+    def clean_ed_note(self):
+        """Neutralise forbidden tags found anywhere inside nota_red.
+
+        Every matching element is emptied (text, attributes and children are
+        cleared) and renamed to ``span``; only its tail text is kept.
+        """
+        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
+        query = '|'.join(['//nota_red//%s' % tag for tag in forbidden])
+        for element in self.edoc.xpath(query):
+            # clear() also wipes the tail, so it is saved and restored
+            kept_tail = element.tail
+            element.clear()
+            element.tag = 'span'
+            element.tail = kept_tail
+
+    def editors(self):
+        """Collect the editors of this book and, recursively, of its parts.
+
+        Combines ``book_info.editors`` with ``book_info.technical_editors``
+        and with the result of ``editors()`` for every part; None entries
+        are dropped.
+
+        :returns: set of dcparser.Person objects
+        :raises NoDublinCore: when book_info is None
+        """
+        info = self.book_info
+        if info is None:
+            raise NoDublinCore('No Dublin Core in document.')
+        found = set(info.editors + info.technical_editors)
+        for part in self.parts():
+            found.update(part.editors())
+        found.discard(None)
+        return found
+
+    # Converters
+
+    def as_html(self, *args, **kwargs):
+        """Delegate to ``librarian.pyhtml.transform``.
+
+        This document is passed first, followed by any extra positional and
+        keyword arguments; the call's result is returned unchanged.
+        """
+        # imported at call time rather than at module level
+        from librarian import pyhtml as html
+        return html.transform(self, *args, **kwargs)
+
+    def as_text(self, *args, **kwargs):
+        """Delegate to ``librarian.text.transform``.
+
+        This document is passed first, followed by any extra positional and
+        keyword arguments; the call's result is returned unchanged.
+        """
+        # imported at call time rather than at module level
+        from librarian import text
+        return text.transform(self, *args, **kwargs)
+
+    def as_epub(self, *args, **kwargs):
+        """Delegate to ``librarian.epub.transform``.
+
+        This document is passed first, followed by any extra positional and
+        keyword arguments; the call's result is returned unchanged.
+        """
+        # imported at call time rather than at module level
+        from librarian import epub
+        return epub.transform(self, *args, **kwargs)
+
+    def as_pdf(self, *args, **kwargs):
+        """Delegate to ``librarian.pdf.transform``.
+
+        This document is passed first, followed by any extra positional and
+        keyword arguments; the call's result is returned unchanged.
+        """
+        # imported at call time rather than at module level
+        from librarian import pdf
+        return pdf.transform(self, *args, **kwargs)
+
+    def as_mobi(self, *args, **kwargs):
+        """Delegate to ``librarian.mobi.transform``.
+
+        This document is passed first, followed by any extra positional and
+        keyword arguments; the call's result is returned unchanged.
+        """
+        # imported at call time rather than at module level
+        from librarian import mobi
+        return mobi.transform(self, *args, **kwargs)
+
+    def as_fb2(self, *args, **kwargs):
+        """Delegate to ``librarian.fb2.transform``.
+
+        This document is passed first, followed by any extra positional and
+        keyword arguments; the call's result is returned unchanged.
+        """
+        # imported at call time rather than at module level
+        from librarian import fb2
+        return fb2.transform(self, *args, **kwargs)
+
+    def as_cover(self, cover_class=None, *args, **kwargs):
+        """Instantiate a cover for this book and return its output file.
+
+        :param cover_class: class called with book_info plus the extra
+            arguments; ``librarian.cover.WLCover`` is used when None
+        :returns: whatever the cover's ``output_file()`` returns
+        """
+        if cover_class is not None:
+            chosen = cover_class
+        else:
+            # default cover class, imported only when it is needed
+            from librarian.cover import WLCover
+            chosen = WLCover
+        cover = chosen(self.book_info, *args, **kwargs)
+        return cover.output_file()
+
+    def save_output_file(self, output_file, output_path=None,
+            output_dir_path=None, make_author_dir=False, ext=None):
+        """Save ``output_file`` under an explicit or a derived path.
+
+        When ``output_dir_path`` is given, the target is that directory,
+        optionally followed by a directory named after the book's author
+        (UTF-8 encoded), then the book's URI slug, then ``.ext`` if ``ext``
+        is set. Otherwise ``output_path`` is used unchanged.
+        """
+        if not output_dir_path:
+            target = output_path
+        else:
+            target = output_dir_path
+            if make_author_dir:
+                author_dir = unicode(self.book_info.author).encode('utf-8')
+                target = os.path.join(target, author_dir)
+            target = os.path.join(target, self.book_info.uri.slug)
+            if ext:
+                target += '.%s' % ext
+
+        output_file.save_as(target)