librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from librarian import ValidationError, NoDublinCore,  ParseError
   7 from librarian import RDFNS, IOFile
   8 from librarian import dcparser
   9
  10 from xml.parsers.expat import ExpatError
  11 from lxml import etree
  12 from lxml.etree import XMLSyntaxError, XSLTApplyError
  13
  14 import os
  15 import re
  16 from StringIO import StringIO
  17
  18
  19 class WLDocument(object):
  20     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
  21     provider = None
  22
  23     _edoc = None
  24
  25     @property
  26     def edoc(self):
  27         if self._edoc is None:
  28             data = self.source.get_string()
  29             if not isinstance(data, unicode):
  30                 data = data.decode('utf-8')
  31             data = data.replace(u'\ufeff', '')
  32             try:
  33                 parser = etree.XMLParser()
  34                 self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser)
  35             except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
  36                 raise ParseError(e)
  37         return self._edoc
  38
  39     _rdf_elem = None
  40
  41     @property
  42     def rdf_elem(self):
  43         if self._rdf_elem is None:
  44             dc_path = './/' + RDFNS('RDF')
  45             self._rdf_elem = self.edoc.getroot().find(dc_path)
  46             if self._rdf_elem is None:
  47                 raise NoDublinCore('Document has no DublinCore - which is required.')
  48         return self._rdf_elem
  49
  50     _book_info = None
  51
  52     @property
  53     def book_info(self):
  54         if not self.parse_dublincore:
  55             return None
  56         if self._book_info is None:
  57             self._book_info = dcparser.BookInfo.from_element(
  58                     self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict)
  59         return self._book_info
  60
  61     def __init__(self, iofile, provider=None, parse_dublincore=True,  # shouldn't it be in a subclass?
  62                  strict=False,  # ?
  63                  meta_fallbacks=None):  # ?
  64         self.source = iofile
  65         self.provider = provider
  66         self.parse_dublincore = parse_dublincore
  67         self.strict = strict
  68         self.meta_fallbacks = meta_fallbacks
  69         root_elem = self.edoc.getroot()
  70         if root_elem.tag != 'utwor':
  71             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
  72         if parse_dublincore:
  73             self.book_info
  74
  75     @classmethod
  76     def from_string(cls, xml, *args, **kwargs):
  77         return cls(IOFile.from_string(xml), *args, **kwargs)
  78
  79     @classmethod
  80     def from_file(cls, xmlfile, *args, **kwargs):
  81         iofile = IOFile.from_filename(xmlfile)
  82         return cls(iofile, *args, **kwargs)
  83
  84     def swap_endlines(self):
  85         """Converts line breaks in stanzas into <br/> tags."""
  86         # only swap inside stanzas
  87         for elem in self.edoc.iter('strofa'):
  88             for child in list(elem):
  89                 if child.tail:
  90                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
  91                     ins_index = elem.index(child) + 1
  92                     while len(chunks) > 1:
  93                         ins = etree.Element('br')
  94                         ins.tail = chunks.pop()
  95                         elem.insert(ins_index, ins)
  96                     child.tail = chunks.pop(0)
  97             if elem.text:
  98                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
  99                 while len(chunks) > 1:
 100                     ins = etree.Element('br')
 101                     ins.tail = chunks.pop()
 102                     elem.insert(0, ins)
 103                 elem.text = chunks.pop(0)
 104
 105     def chunk(self, path):
 106         # convert the path to XPath
 107         expr = self.path_to_xpath(path)
 108         elems = self.edoc.xpath(expr)
 109
 110         if len(elems) == 0:
 111             return None
 112         else:
 113             return elems[0]
 114
 115     def path_to_xpath(self, path):
 116         parts = []
 117
 118         for part in path.split('/'):
 119             match = re.match(r'([^\[]+)\[(\d+)\]', part)
 120             if not match:
 121                 parts.append(part)
 122             else:
 123                 tag, n = match.groups()
 124                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
 125
 126         if parts[0] == '.':
 127             parts[0] = ''
 128
 129         return '/'.join(parts)
 130
 131     def transform(self, stylesheet, **options):
 132         return self.edoc.xslt(stylesheet, **options)
 133
 134     def update_dc(self):
 135         if self.book_info:
 136             parent = self.rdf_elem.getparent()
 137             parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
 138
 139     def serialize(self):
 140         self.update_dc()
 141         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
 142
 143     def merge_chunks(self, chunk_dict):
 144         unmerged = []
 145
 146         for key, data in chunk_dict.iteritems():
 147             try:
 148                 xpath = self.path_to_xpath(key)
 149                 node = self.edoc.xpath(xpath)[0]
 150                 repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
 151                 node.getparent().replace(node, repl)
 152             except Exception, e:
 153                 # WTF xpath may be unused; also: too broad except
 154                 unmerged.append(repr((key, xpath, e)))
 155
 156         return unmerged
 157
 158     def clean_ed_note(self):
 159         """ deletes forbidden tags from nota_red """
 160
 161         forbidden_tags = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
 162         for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in forbidden_tags)):
 163             tail = node.tail
 164             node.clear()
 165             node.tag = 'span'
 166             node.tail = tail
 167
 168     # Converters
 169
 170     def as_html(self, *args, **kwargs):
 171         from librarian import pyhtml as html
 172         return html.transform(self, *args, **kwargs)
 173
 174     def as_weasy(self, *args, **kwargs):
 175         from librarian import weasy
 176         return weasy.transform(self, *args, **kwargs)
 177
 178     def as_text(self, *args, **kwargs):
 179         from librarian import text
 180         return text.transform(self, *args, **kwargs)
 181
 182     def as_epub(self, *args, **kwargs):
 183         from librarian import epub
 184         return epub.transform(self, *args, **kwargs)
 185
 186     def as_pdf(self, *args, **kwargs):
 187         from librarian import pypdf
 188         return pypdf.EduModulePDFFormat(self).build(*args, **kwargs)
 189
 190     def as_mobi(self, *args, **kwargs):
 191         from librarian import mobi
 192         return mobi.transform(self, *args, **kwargs)
 193
 194     def as_fb2(self, *args, **kwargs):
 195         from librarian import fb2
 196         return fb2.transform(self, *args, **kwargs)
 197
 198     def as_cover(self, cover_class=None, *args, **kwargs):
 199         if cover_class is None:
 200             from librarian.styles.wolnelektury.cover import WLCover
 201             cover_class = WLCover
 202         return cover_class(self.book_info, *args, **kwargs).output_file()
 203
 204     def save_output_file(self, output_file, output_path=None, output_dir_path=None, make_author_dir=False, ext=None):
 205         if output_dir_path:
 206             save_path = output_dir_path
 207             if make_author_dir:
 208                 save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8'))
 209             save_path = os.path.join(save_path, self.book_info.uri.slug)
 210             if ext:
 211                 save_path += '.%s' % ext
 212         else:
 213             save_path = output_path
 214
 215         output_file.save_as(save_path)