Fixes in the experimental converters.
[librarian.git] / src / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
9 from librarian import RDFNS
10 from librarian.cover import make_cover
11 from librarian import dcparser
12
13 from xml.parsers.expat import ExpatError
14 from lxml import etree
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
16
17 import os
18 import re
19 import six
20
21
22 from .elements import WL_ELEMENTS
23
24
class WLElementLookup(etree.CustomElementClassLookup):
    """Custom element class lookup backed by the WL_ELEMENTS mapping."""

    def lookup(self, node_type, document, namespace, name):
        """Pick the element class for a node.

        Returns None for anything that is not an element and for elements
        that live in a namespace; otherwise returns ``WL_ELEMENTS[name]``.
        """
        plain_element = node_type == 'element' and not namespace
        return WL_ELEMENTS[name] if plain_element else None
32
33
# Module-level XML parser whose element classes are chosen by WLElementLookup.
parser = etree.XMLParser()
parser.set_element_class_lookup(WLElementLookup())
38
39
40
class WLDocument(object):
    """Legacy class, to be replaced with documents.WLDocument.

    Wraps a parsed element tree whose root element is ``utwor`` and,
    optionally, the Dublin Core metadata read from its RDF element.
    """
    # A slash followed by one whitespace character; swap_endlines() splits
    # stanza text on this pattern.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed document tree.

        :param edoc: parsed tree; its root element must be ``utwor``
        :param parse_dublincore: when true, locate the RDF element and
            build ``self.book_info`` from it; otherwise ``book_info``
            is set to None
        :param provider: object used by parts() to fetch documents by URI
        :param strict: passed through to dcparser.BookInfo.from_element
        :param meta_fallbacks: passed to from_element as ``fallbacks``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if metadata was requested but no RDF
            element exists in the tree
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()

        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'"
                % root_elem.tag
            )

        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore(
                    "Document must have a '%s' element." % RDFNS('RDF')
                )

            self.book_info = dcparser.BookInfo.from_element(
                self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
        else:
            self.book_info = None

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build a document from a bytes object holding the XML source.

        Extra arguments are forwarded to from_file().
        """
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file path or a readable file object.

        :param xmlfile: a path (string) or an object with ``read()``
        :raises ParseError: wraps ExpatError, XMLSyntaxError and
            XSLTApplyError raised while parsing or constructing

        Extra arguments are forwarded to the constructor.
        """
        # Strings are treated as paths; anything else must provide read().
        if isinstance(xmlfile, six.string_types):
            with open(xmlfile, 'rb') as stream:
                data = stream.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, six.text_type):
            data = data.decode('utf-8')

        # Drop every U+FEFF (byte order mark) before handing data to lxml.
        data = data.replace(u'\ufeff', '')

        try:
            xml_parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(six.BytesIO(data.encode('utf-8')), xml_parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags.

        Inside every ``strofa`` element, text is split on LINE_SWAP_EXPR
        and a ``br`` element is inserted at each split point.
        """
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: new <br/> children are inserted below.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Insert from the last chunk backwards at a fixed index,
                    # so the resulting order matches the original text.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI listed in ``book_info.parts``.

        Each part is fetched with ``provider.by_uri`` and loaded through
        from_file() with the same provider.

        :raises NoProvider: if no provider was supplied
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(
                self.provider.by_uri(part_uri), provider=self.provider
            )

    def chunk(self, path):
        """Return the first node matching *path*, or None if none match.

        *path* is converted with path_to_xpath() before evaluation.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        return elems[0] if elems else None

    def path_to_xpath(self, path):
        """Translate a slash-separated chunk path into an XPath expression.

        A segment of the form ``tag[n]`` becomes
        ``*[n+1][name() = 'tag']``; any other segment is copied verbatim.
        A leading ``.`` segment is replaced by an empty one, so the result
        then starts with ``/``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply *stylesheet* to the tree via ``edoc.xslt`` and return it."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one regenerated from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            new_rdf = self.book_info.to_etree(parent)
            parent.replace(self.rdf_elem, new_rdf)
            # Track the element that is now in the tree; the old one is
            # detached, so a later call would find no parent for it.
            self.rdf_elem = new_rdf

    def serialize(self):
        """Return the document as pretty-printed unicode XML.

        The Dublin Core section is refreshed with update_dc() first.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes of the tree with new content, chunk by chunk.

        :param chunk_dict: maps chunk paths (see path_to_xpath) to XML
            content; each content is wrapped in the matched node's tag,
            parsed, and substituted for that node
        :returns: list of ``repr((key, xpath, error))`` strings, one for
            every chunk that could not be merged
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset per chunk so the failure report never shows an unbound
            # or stale expression when path_to_xpath() itself fails.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                    "<%s>%s</%s>" % (node.tag, data, node.tag)
                )
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort by design: record the failure and carry on.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self, note_tag='nota_red'):
        """Neutralize forbidden tags found inside *note_tag* elements.

        Every matching node is cleared and renamed to ``span``; its tail
        text is kept.
        """
        for node in self.edoc.xpath('|'.join(
                '//%s//%s' % (note_tag, tag) for tag in
                ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
            # clear() also wipes the tail, so save and restore it.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors
                      + self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters

    def as_html(self, *args, **kwargs):
        """Convert the document with ``librarian.html.transform``."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Convert the document with ``librarian.text.transform``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Convert the document with ``librarian.epub.transform``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Convert the document with ``librarian.pdf.transform``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Convert the document with ``librarian.mobi.transform``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Convert the document with ``librarian.fb2.transform``."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Build a cover from ``book_info`` and return its output file.

        :param cover_class: callable used to build the cover; defaults
            to make_cover
        """
        if cover_class is None:
            cover_class = make_cover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    # for debugging only
    def latex_dir(self, *args, **kwargs):
        """Call ``librarian.pdf.transform`` with ``latex_dir=True``."""
        kwargs['latex_dir'] = True
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
                         output_dir_path=None, make_author_dir=False,
                         ext=None):
        """Save *output_file* under an explicit or a derived path.

        :param output_file: object providing ``save_as(path)``
        :param output_path: full target path; used only when
            *output_dir_path* is not given
        :param output_dir_path: directory in which the file is saved
            under the book's slug
        :param make_author_dir: when true, add a subdirectory named
            after the book's author below *output_dir_path*
        :param ext: extension appended to the slug, without the dot
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                author_dir = six.text_type(self.book_info.author)
                # Only Python 2 wants a byte string here; on Python 3
                # os.path.join() refuses to mix str and bytes.
                if six.PY2:
                    author_dir = author_dir.encode('utf-8')
                save_path = os.path.join(save_path, author_dir)
            save_path = os.path.join(save_path, self.book_info.url.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)