src/librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import unicode_literals
   7
   8 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
   9 from librarian import RDFNS
  10 from librarian.cover import make_cover
  11 from librarian import dcparser
  12
  13 from xml.parsers.expat import ExpatError
  14 from lxml import etree
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
  17 import os
  18 import re
  19 import six
  20
  21
  22 class WLDocument(object):
  23     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
  24     provider = None
  25
  26     def __init__(self, edoc, parse_dublincore=True, provider=None,
  27                  strict=False, meta_fallbacks=None):
  28         self.edoc = edoc
  29         self.provider = provider
  30
  31         root_elem = edoc.getroot()
  32
  33         dc_path = './/' + RDFNS('RDF')
  34
  35         if root_elem.tag != 'utwor':
  36             raise ValidationError(
  37                 "Invalid root element. Found '%s', should be 'utwor'"
  38                 % root_elem.tag
  39             )
  40
  41         if parse_dublincore:
  42             self.rdf_elem = root_elem.find(dc_path)
  43
  44             if self.rdf_elem is None:
  45                 raise NoDublinCore(
  46                     "Document must have a '%s' element." % RDFNS('RDF')
  47                 )
  48
  49             self.book_info = dcparser.BookInfo.from_element(
  50                 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
  51         else:
  52             self.book_info = None
  53
  54     @classmethod
  55     def from_bytes(cls, xml, *args, **kwargs):
  56         return cls.from_file(six.BytesIO(xml), *args, **kwargs)
  57
  58     @classmethod
  59     def from_file(cls, xmlfile, *args, **kwargs):
  60
  61         # first, prepare for parsing
  62         if isinstance(xmlfile, six.text_type):
  63             file = open(xmlfile, 'rb')
  64             try:
  65                 data = file.read()
  66             finally:
  67                 file.close()
  68         else:
  69             data = xmlfile.read()
  70
  71         if not isinstance(data, six.text_type):
  72             data = data.decode('utf-8')
  73
  74         data = data.replace(u'\ufeff', '')
  75
  76         try:
  77             parser = etree.XMLParser(remove_blank_text=False)
  78             tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
  79
  80             return cls(tree, *args, **kwargs)
  81         except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
  82             raise ParseError(e)
  83
  84     def swap_endlines(self):
  85         """Converts line breaks in stanzas into <br/> tags."""
  86         # only swap inside stanzas
  87         for elem in self.edoc.iter('strofa'):
  88             for child in list(elem):
  89                 if child.tail:
  90                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
  91                     ins_index = elem.index(child) + 1
  92                     while len(chunks) > 1:
  93                         ins = etree.Element('br')
  94                         ins.tail = chunks.pop()
  95                         elem.insert(ins_index, ins)
  96                     child.tail = chunks.pop(0)
  97             if elem.text:
  98                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
  99                 while len(chunks) > 1:
 100                     ins = etree.Element('br')
 101                     ins.tail = chunks.pop()
 102                     elem.insert(0, ins)
 103                 elem.text = chunks.pop(0)
 104
 105     def parts(self):
 106         if self.provider is None:
 107             raise NoProvider('No document provider supplied.')
 108         if self.book_info is None:
 109             raise NoDublinCore('No Dublin Core in document.')
 110         for part_uri in self.book_info.parts:
 111             yield self.from_file(
 112                 self.provider.by_uri(part_uri), provider=self.provider
 113             )
 114
 115     def chunk(self, path):
 116         # convert the path to XPath
 117         expr = self.path_to_xpath(path)
 118         elems = self.edoc.xpath(expr)
 119
 120         if len(elems) == 0:
 121             return None
 122         else:
 123             return elems[0]
 124
 125     def path_to_xpath(self, path):
 126         parts = []
 127
 128         for part in path.split('/'):
 129             match = re.match(r'([^\[]+)\[(\d+)\]', part)
 130             if not match:
 131                 parts.append(part)
 132             else:
 133                 tag, n = match.groups()
 134                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
 135
 136         if parts[0] == '.':
 137             parts[0] = ''
 138
 139         return '/'.join(parts)
 140
 141     def transform(self, stylesheet, **options):
 142         return self.edoc.xslt(stylesheet, **options)
 143
 144     def update_dc(self):
 145         if self.book_info:
 146             parent = self.rdf_elem.getparent()
 147             parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
 148
 149     def serialize(self):
 150         self.update_dc()
 151         return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
 152
 153     def merge_chunks(self, chunk_dict):
 154         unmerged = []
 155
 156         for key, data in chunk_dict.iteritems():
 157             try:
 158                 xpath = self.path_to_xpath(key)
 159                 node = self.edoc.xpath(xpath)[0]
 160                 repl = etree.fromstring(
 161                     "<%s>%s</%s>" % (node.tag, data, node.tag)
 162                 )
 163                 node.getparent().replace(node, repl)
 164             except Exception as e:
 165                 unmerged.append(repr((key, xpath, e)))
 166
 167         return unmerged
 168
 169     def clean_ed_note(self, note_tag='nota_red'):
 170         """ deletes forbidden tags from nota_red """
 171
 172         for node in self.edoc.xpath('|'.join(
 173                 '//%s//%s' % (note_tag, tag) for tag in
 174                 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
 175             tail = node.tail
 176             node.clear()
 177             node.tag = 'span'
 178             node.tail = tail
 179
 180     def editors(self):
 181         """Returns a set of all editors for book and its children.
 182
 183         :returns: set of dcparser.Person objects
 184         """
 185         if self.book_info is None:
 186             raise NoDublinCore('No Dublin Core in document.')
 187         persons = set(self.book_info.editors
 188                       + self.book_info.technical_editors)
 189         for child in self.parts():
 190             persons.update(child.editors())
 191         if None in persons:
 192             persons.remove(None)
 193         return persons
 194
 195     # Converters
 196
 197     def as_html(self, *args, **kwargs):
 198         from librarian import html
 199         return html.transform(self, *args, **kwargs)
 200
 201     def as_text(self, *args, **kwargs):
 202         from librarian import text
 203         return text.transform(self, *args, **kwargs)
 204
 205     def as_epub(self, *args, **kwargs):
 206         from librarian import epub
 207         return epub.transform(self, *args, **kwargs)
 208
 209     def as_pdf(self, *args, **kwargs):
 210         from librarian import pdf
 211         return pdf.transform(self, *args, **kwargs)
 212
 213     def as_mobi(self, *args, **kwargs):
 214         from librarian import mobi
 215         return mobi.transform(self, *args, **kwargs)
 216
 217     def as_fb2(self, *args, **kwargs):
 218         from librarian import fb2
 219         return fb2.transform(self, *args, **kwargs)
 220
 221     def as_cover(self, cover_class=None, *args, **kwargs):
 222         if cover_class is None:
 223             cover_class = make_cover
 224         return cover_class(self.book_info, *args, **kwargs).output_file()
 225
 226     # for debugging only
 227     def latex_dir(self, *args, **kwargs):
 228         kwargs['latex_dir'] = True
 229         from librarian import pdf
 230         return pdf.transform(self, *args, **kwargs)
 231
 232     def save_output_file(self, output_file, output_path=None,
 233                          output_dir_path=None, make_author_dir=False,
 234                          ext=None):
 235         if output_dir_path:
 236             save_path = output_dir_path
 237             if make_author_dir:
 238                 save_path = os.path.join(
 239                     save_path,
 240                     six.text_type(self.book_info.author).encode('utf-8')
 241                 )
 242             save_path = os.path.join(save_path, self.book_info.url.slug)
 243             if ext:
 244                 save_path += '.%s' % ext
 245         else:
 246             save_path = output_path
 247
 248         output_file.save_as(save_path)