librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
   7 from librarian import RDFNS, IOFile
   8 from librarian.styles.wolnelektury.cover import WLCover
   9 from librarian import dcparser
  10
  11 from xml.parsers.expat import ExpatError
  12 from lxml import etree
  13 from lxml.etree import XMLSyntaxError, XSLTApplyError
  14
  15 import os
  16 import re
  17 from StringIO import StringIO
  18
  19 class WLDocument(object):
  20     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
  21     provider = None
  22
  23     _edoc = None
  24     @property
  25     def edoc(self):
  26         if self._edoc is None:
  27             data = self.source.get_string()
  28             if not isinstance(data, unicode):
  29                 data = data.decode('utf-8')
  30             data = data.replace(u'\ufeff', '')
  31             try:
  32                 parser = etree.XMLParser(remove_blank_text=False)
  33                 self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser)
  34             except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
  35                 raise ParseError(e)
  36         return self._edoc
  37
  38     _rdf_elem = None
  39     @property
  40     def rdf_elem(self):
  41         if self._rdf_elem is None:
  42             dc_path = './/' + RDFNS('RDF')
  43             self._rdf_elem = self.edoc.getroot().find(dc_path)
  44             if self._rdf_elem is None:
  45                 raise NoDublinCore('Document has no DublinCore - which is required.')
  46         return self._rdf_elem
  47
  48     _book_info = None
  49     @property
  50     def book_info(self):
  51         if not self.parse_dublincore:
  52             return None
  53         if self._book_info is None:
  54             self._book_info = dcparser.BookInfo.from_element(
  55                     self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict)
  56         return self._book_info
  57
  58     def __init__(self, iofile, provider=None,
  59             parse_dublincore=True, # shouldn't it be in a subclass?
  60             strict=False, # ?
  61             meta_fallbacks=None # ?
  62             ):
  63         self.source = iofile
  64         self.provider = provider
  65         self.parse_dublincore = parse_dublincore
  66         self.strict = strict
  67         self.meta_fallbacks = meta_fallbacks
  68         if self.edoc.getroot().tag != 'utwor':
  69             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
  70         if parse_dublincore:
  71             self.book_info
  72
  73     @classmethod
  74     def from_string(cls, xml, *args, **kwargs):
  75         return cls(IOFile.from_string(xml), *args, **kwargs)
  76
  77     @classmethod
  78     def from_file(cls, xmlfile, *args, **kwargs):
  79         if isinstance(xmlfile, basestring):
  80             iofile = IOFile.from_filename(xmlfile)
  81         else:
  82             iofile = IOFile.from_file(xmlfile)
  83         return cls(iofile, *args, **kwargs)
  84
  85
  86     def swap_endlines(self):
  87         """Converts line breaks in stanzas into <br/> tags."""
  88         # only swap inside stanzas
  89         for elem in self.edoc.iter('strofa'):
  90             for child in list(elem):
  91                 if child.tail:
  92                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
  93                     ins_index = elem.index(child) + 1
  94                     while len(chunks) > 1:
  95                         ins = etree.Element('br')
  96                         ins.tail = chunks.pop()
  97                         elem.insert(ins_index, ins)
  98                     child.tail = chunks.pop(0)
  99             if elem.text:
 100                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
 101                 while len(chunks) > 1:
 102                     ins = etree.Element('br')
 103                     ins.tail = chunks.pop()
 104                     elem.insert(0, ins)
 105                 elem.text = chunks.pop(0)
 106
 107     def parts(self):
 108         if self.book_info is None:
 109             raise NoDublinCore('No Dublin Core in document.')
 110         if self.book_info.parts and self.provider is None:
 111             raise NoProvider('No document provider supplied.')
 112         for part_uri in self.book_info.parts:
 113             yield self.from_file(self.provider.by_uri(part_uri),
 114                     provider=self.provider)
 115
 116     def chunk(self, path):
 117         # convert the path to XPath
 118         expr = self.path_to_xpath(path)
 119         elems = self.edoc.xpath(expr)
 120
 121         if len(elems) == 0:
 122             return None
 123         else:
 124             return elems[0]
 125
 126     def path_to_xpath(self, path):
 127         parts = []
 128
 129         for part in path.split('/'):
 130             match = re.match(r'([^\[]+)\[(\d+)\]', part)
 131             if not match:
 132                 parts.append(part)
 133             else:
 134                 tag, n = match.groups()
 135                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
 136
 137         if parts[0] == '.':
 138             parts[0] = ''
 139
 140         return '/'.join(parts)
 141
 142     def transform(self, stylesheet, **options):
 143         return self.edoc.xslt(stylesheet, **options)
 144
 145     def update_dc(self):
 146         if self.book_info:
 147             parent = self.rdf_elem.getparent()
 148             parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
 149
 150     def serialize(self):
 151         self.update_dc()
 152         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
 153
 154     def merge_chunks(self, chunk_dict):
 155         unmerged = []
 156
 157         for key, data in chunk_dict.iteritems():
 158             try:
 159                 xpath = self.path_to_xpath(key)
 160                 node = self.edoc.xpath(xpath)[0]
 161                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
 162                 node.getparent().replace(node, repl)
 163             except Exception, e:
 164                 unmerged.append( repr( (key, xpath, e) ) )
 165
 166         return unmerged
 167
 168     def clean_ed_note(self):
 169         """ deletes forbidden tags from nota_red """
 170
 171         for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
 172                     ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
 173             tail = node.tail
 174             node.clear()
 175             node.tag = 'span'
 176             node.tail = tail
 177
 178     def editors(self):
 179         """Returns a set of all editors for book and its children.
 180
 181         :returns: set of dcparser.Person objects
 182         """
 183         if self.book_info is None:
 184             raise NoDublinCore('No Dublin Core in document.')
 185         persons = set(self.book_info.editors +
 186                         self.book_info.technical_editors)
 187         for child in self.parts():
 188             persons.update(child.editors())
 189         if None in persons:
 190             persons.remove(None)
 191         return persons
 192
 193     # Converters
 194
 195     def as_html(self, *args, **kwargs):
 196         from librarian import pyhtml as html
 197         return html.transform(self, *args, **kwargs)
 198
 199     def as_text(self, *args, **kwargs):
 200         from librarian import text
 201         return text.transform(self, *args, **kwargs)
 202
 203     def as_epub(self, *args, **kwargs):
 204         from librarian import epub
 205         return epub.transform(self, *args, **kwargs)
 206
 207     def as_pdf(self, *args, **kwargs):
 208         from librarian import pdf
 209         return pdf.transform(self, *args, **kwargs)
 210
 211     def as_mobi(self, *args, **kwargs):
 212         from librarian import mobi
 213         return mobi.transform(self, *args, **kwargs)
 214
 215     def as_fb2(self, *args, **kwargs):
 216         from librarian import fb2
 217         return fb2.transform(self, *args, **kwargs)
 218
 219     def as_cover(self, cover_class=None, *args, **kwargs):
 220         if cover_class is None:
 221             cover_class = WLCover
 222         return cover_class(self.book_info, *args, **kwargs).output_file()
 223
 224     def save_output_file(self, output_file, output_path=None,
 225             output_dir_path=None, make_author_dir=False, ext=None):
 226         if output_dir_path:
 227             save_path = output_dir_path
 228             if make_author_dir:
 229                 save_path = os.path.join(save_path,
 230                         unicode(self.book_info.author).encode('utf-8'))
 231             save_path = os.path.join(save_path,
 232                                 self.book_info.uri.slug)
 233             if ext:
 234                 save_path += '.%s' % ext
 235         else:
 236             save_path = output_path
 237
 238         output_file.save_as(save_path)