librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
   7 from librarian import RDFNS, IOFile
   8 from librarian import dcparser
   9
  10 from xml.parsers.expat import ExpatError
  11 from lxml import etree
  12 from lxml.etree import XMLSyntaxError, XSLTApplyError
  13
  14 import os
  15 import re
  16 from StringIO import StringIO
  17
  18 class WLDocument(object):
  19     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
  20     provider = None
  21
  22     _edoc = None
  23     @property
  24     def edoc(self):
  25         if self._edoc is None:
  26             data = self.source.get_string()
  27             if not isinstance(data, unicode):
  28                 data = data.decode('utf-8')
  29             data = data.replace(u'\ufeff', '')
  30             try:
  31                 parser = etree.XMLParser(remove_blank_text=False)
  32                 self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser)
  33             except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
  34                 raise ParseError(e)
  35         return self._edoc
  36
  37     _rdf_elem = None
  38     @property
  39     def rdf_elem(self):
  40         if self._rdf_elem is None:
  41             dc_path = './/' + RDFNS('RDF')
  42             self._rdf_elem = self.edoc.getroot().find(dc_path)
  43             if self._rdf_elem is None:
  44                 raise NoDublinCore('Document has no DublinCore - which is required.')
  45         return self._rdf_elem
  46
  47     _book_info = None
  48     @property
  49     def book_info(self):
  50         if not self.parse_dublincore:
  51             return None
  52         if self._book_info is None:
  53             self._book_info = dcparser.BookInfo.from_element(
  54                     self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict)
  55         return self._book_info
  56
  57     def __init__(self, iofile, provider=None,
  58             parse_dublincore=True, # shouldn't it be in a subclass?
  59             strict=False, # ?
  60             meta_fallbacks=None # ?
  61             ):
  62         self.source = iofile
  63         self.provider = provider
  64         self.parse_dublincore = parse_dublincore
  65         self.strict = strict
  66         self.meta_fallbacks = meta_fallbacks
  67         if self.edoc.getroot().tag != 'utwor':
  68             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
  69         if parse_dublincore:
  70             self.book_info
  71
  72     @classmethod
  73     def from_string(cls, xml, *args, **kwargs):
  74         return cls(IOFile.from_string(xml), *args, **kwargs)
  75
  76     @classmethod
  77     def from_file(cls, xmlfile, *args, **kwargs):
  78         iofile = IOFile.from_filename(xmlfile)
  79         return cls(iofile, *args, **kwargs)
  80
  81
  82     def swap_endlines(self):
  83         """Converts line breaks in stanzas into <br/> tags."""
  84         # only swap inside stanzas
  85         for elem in self.edoc.iter('strofa'):
  86             for child in list(elem):
  87                 if child.tail:
  88                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
  89                     ins_index = elem.index(child) + 1
  90                     while len(chunks) > 1:
  91                         ins = etree.Element('br')
  92                         ins.tail = chunks.pop()
  93                         elem.insert(ins_index, ins)
  94                     child.tail = chunks.pop(0)
  95             if elem.text:
  96                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
  97                 while len(chunks) > 1:
  98                     ins = etree.Element('br')
  99                     ins.tail = chunks.pop()
 100                     elem.insert(0, ins)
 101                 elem.text = chunks.pop(0)
 102
 103     def chunk(self, path):
 104         # convert the path to XPath
 105         expr = self.path_to_xpath(path)
 106         elems = self.edoc.xpath(expr)
 107
 108         if len(elems) == 0:
 109             return None
 110         else:
 111             return elems[0]
 112
 113     def path_to_xpath(self, path):
 114         parts = []
 115
 116         for part in path.split('/'):
 117             match = re.match(r'([^\[]+)\[(\d+)\]', part)
 118             if not match:
 119                 parts.append(part)
 120             else:
 121                 tag, n = match.groups()
 122                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
 123
 124         if parts[0] == '.':
 125             parts[0] = ''
 126
 127         return '/'.join(parts)
 128
 129     def transform(self, stylesheet, **options):
 130         return self.edoc.xslt(stylesheet, **options)
 131
 132     def update_dc(self):
 133         if self.book_info:
 134             parent = self.rdf_elem.getparent()
 135             parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
 136
 137     def serialize(self):
 138         self.update_dc()
 139         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
 140
 141     def merge_chunks(self, chunk_dict):
 142         unmerged = []
 143
 144         for key, data in chunk_dict.iteritems():
 145             try:
 146                 xpath = self.path_to_xpath(key)
 147                 node = self.edoc.xpath(xpath)[0]
 148                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
 149                 node.getparent().replace(node, repl)
 150             except Exception, e:
 151                 unmerged.append( repr( (key, xpath, e) ) )
 152
 153         return unmerged
 154
 155     def clean_ed_note(self):
 156         """ deletes forbidden tags from nota_red """
 157
 158         for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
 159                     ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
 160             tail = node.tail
 161             node.clear()
 162             node.tag = 'span'
 163             node.tail = tail
 164
 165     # Converters
 166
 167     def as_html(self, *args, **kwargs):
 168         from librarian import pyhtml as html
 169         return html.transform(self, *args, **kwargs)
 170
 171     def as_text(self, *args, **kwargs):
 172         from librarian import text
 173         return text.transform(self, *args, **kwargs)
 174
 175     def as_epub(self, *args, **kwargs):
 176         from librarian import epub
 177         return epub.transform(self, *args, **kwargs)
 178
 179     def as_pdf(self, *args, **kwargs):
 180         from librarian import pypdf
 181         return pypdf.EduModulePDFFormat(self).build(*args, **kwargs)
 182
 183     def as_mobi(self, *args, **kwargs):
 184         from librarian import mobi
 185         return mobi.transform(self, *args, **kwargs)
 186
 187     def as_fb2(self, *args, **kwargs):
 188         from librarian import fb2
 189         return fb2.transform(self, *args, **kwargs)
 190
 191     def as_cover(self, cover_class=None, *args, **kwargs):
 192         if cover_class is None:
 193             from librarian.styles.wolnelektury.cover import WLCover
 194             cover_class = WLCover
 195         return cover_class(self.book_info, *args, **kwargs).output_file()
 196
 197     def save_output_file(self, output_file, output_path=None,
 198             output_dir_path=None, make_author_dir=False, ext=None):
 199         if output_dir_path:
 200             save_path = output_dir_path
 201             if make_author_dir:
 202                 save_path = os.path.join(save_path,
 203                         unicode(self.book_info.author).encode('utf-8'))
 204             save_path = os.path.join(save_path,
 205                                 self.book_info.uri.slug)
 206             if ext:
 207                 save_path += '.%s' % ext
 208         else:
 209             save_path = output_path
 210
 211         output_file.save_as(save_path)