6e21d4deed6c1882d7a6e73d6c1be70ee099fd14
[librarian.git] / src / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
9 from librarian import RDFNS
10 from librarian.cover import make_cover
11 from librarian import dcparser
12
13 from xml.parsers.expat import ExpatError
14 from lxml import etree
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
16
17 import os
18 import re
19 import six
20
21
22 from .elements import WL_ELEMENTS
23
24
class WLElementLookup(etree.CustomElementClassLookup):
    """Element class lookup backed by the WL_ELEMENTS mapping."""

    def lookup(self, node_type, document, namespace, name):
        """Pick the element class for a node.

        Only un-namespaced nodes of type ``'element'`` are resolved, by
        indexing WL_ELEMENTS with the tag name. For every other node
        None is returned (presumably letting lxml fall back to its
        default classes -- confirm against the lxml documentation).
        """
        is_plain_element = node_type == 'element' and not namespace
        if not is_plain_element:
            return None
        return WL_ELEMENTS[name]
32
33
# Module-level XML parser whose element classes are chosen by
# WLElementLookup.
parser = etree.XMLParser()
parser.set_element_class_lookup(WLElementLookup())
38
39
40
class WLDocument(object):
    """Legacy class, to be replaced with documents.WLDocument.

    Wraps an lxml element tree whose root element is ``utwor`` and,
    optionally, the book metadata parsed from its RDF element.
    """
    # Matches a slash followed by a single whitespace character; used by
    # swap_endlines() to split stanza text into separate lines.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Validate the tree and optionally parse its Dublin Core data.

        :param edoc: element tree; must provide ``getroot()``
        :param parse_dublincore: when true, locate the RDF element and
            build ``book_info`` from it; otherwise ``book_info`` is None
        :param provider: object used by parts() to fetch child documents
        :param strict: passed through to dcparser.BookInfo.from_element
        :param meta_fallbacks: passed as ``fallbacks`` to
            dcparser.BookInfo.from_element
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if metadata is requested but the document
            has no RDF element
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()

        if root_elem.tag != 'utwor':
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'"
                % root_elem.tag
            )

        if parse_dublincore:
            rdf_tag = RDFNS('RDF')
            self.rdf_elem = root_elem.find('.//' + rdf_tag)

            if self.rdf_elem is None:
                raise NoDublinCore(
                    "Document must have a '%s' element." % rdf_tag
                )

            self.book_info = dcparser.BookInfo.from_element(
                self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
        else:
            # Keep both attributes defined even without metadata.
            self.rdf_elem = None
            self.book_info = None

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build a document from XML given as a byte string.

        Extra arguments are forwarded to from_file().
        """
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a path or a readable file-like object.

        Extra arguments are forwarded to the constructor.

        :param xmlfile: text path of a file to open, or an object with
            a ``read()`` method returning bytes or text
        :raises ParseError: wrapping an ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing
        """
        # first, prepare for parsing
        if isinstance(xmlfile, six.text_type):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, six.text_type):
            data = data.decode('utf-8')

        # Drop byte order marks wherever they occur in the text.
        data = data.replace(u'\ufeff', '')

        try:
            xml_parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(six.BytesIO(data.encode('utf-8')), xml_parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot, because <br/> siblings are
            # inserted into elem while walking it.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end and always inserted
                    # at the same index, which keeps their order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI listed in book_info.parts.

        Each part is loaded through ``provider.by_uri`` and shares this
        document's provider. This is a generator, so the checks below
        run only once iteration starts.

        :raises NoProvider: if no provider was supplied
        :raises NoDublinCore: if the document has no book_info
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(
                self.provider.by_uri(part_uri), provider=self.provider
            )

    def chunk(self, path):
        """Return the first node matching *path*, or None if none does.

        :param path: path in the form accepted by path_to_xpath()
        """
        # convert the path to XPath
        elems = self.edoc.xpath(self.path_to_xpath(path))
        if not elems:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A step of the form ``tag[n]`` (``n`` counted from zero) becomes
        ``*[n+1][name() = 'tag']``; any other step is copied unchanged.
        A leading ``.`` step is emptied, so ``./a`` yields ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                # XPath positions are one-based.
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply an XSLT stylesheet to the tree and return the result."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one rebuilt from book_info.

        Does nothing when the document has no (truthy) book_info.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            new_rdf = self.book_info.to_etree(parent)
            parent.replace(self.rdf_elem, new_rdf)
            # Track the element that is now in the tree; the replaced
            # one is detached, so a later call would find no parent.
            self.rdf_elem = new_rdf

    def serialize(self):
        """Return the document as pretty-printed text.

        The RDF element is refreshed from book_info first.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes of the tree with new content.

        Every key of *chunk_dict* is a path accepted by path_to_xpath();
        its value is wrapped in an element with the tag of the node it
        replaces and parsed as XML.

        :returns: list with one ``repr((key, xpath, error))`` string for
            every chunk that could not be merged
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset per chunk so the error report never shows an
            # expression left over from a previous key.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                    "<%s>%s</%s>" % (node.tag, data, node.tag)
                )
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort: report the failure and keep merging.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self, note_tag='nota_red'):
        """Neutralise forbidden tags found inside *note_tag* elements.

        Every matching descendant is emptied and renamed to ``span``;
        only its tail text is kept.
        """
        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        expr = '|'.join('//%s//%s' % (note_tag, tag) for tag in forbidden)
        for node in self.edoc.xpath(expr):
            # The tail is saved and restored around clear().
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def fix_pa_akap(self):
        """Flag pa/pe/pr/pt elements that directly contain akap.

        The parent gets ``blocks="true"``; an akap that is the first
        child of its parent also gets ``inline="true"``.
        """
        for pa in ('pa', 'pe', 'pr', 'pt'):
            # A relative './/' path is used because ElementTree.findall
            # warns about paths that start with '/'.
            for akap in self.edoc.findall('.//%s/akap' % pa):
                parent = akap.getparent()
                parent.set('blocks', 'true')
                if not parent.index(akap):
                    akap.set('inline', 'true')

    def editors(self):
        """Returns a set of all editors for book and its children.

        Child documents are obtained through parts(), so a provider is
        required.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no book_info
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors
                      + self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters

    def as_html(self, *args, **kwargs):
        """Return the result of librarian.html.transform for this document."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return the result of librarian.text.transform for this document."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return the result of librarian.epub.transform for this document."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return the result of librarian.pdf.transform for this document."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return the result of librarian.mobi.transform for this document."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return the result of librarian.fb2.transform for this document."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Build a cover from book_info and return its output file.

        :param cover_class: callable taking book_info plus the extra
            arguments; defaults to make_cover
        """
        if cover_class is None:
            cover_class = make_cover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    # for debugging only
    def latex_dir(self, *args, **kwargs):
        """Call librarian.pdf.transform with ``latex_dir=True`` forced."""
        kwargs['latex_dir'] = True
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
                         output_dir_path=None, make_author_dir=False,
                         ext=None):
        """Save *output_file* under an explicit or a derived path.

        With *output_dir_path* the target is built as
        ``<dir>[/<author>]/<slug>[.<ext>]`` from book_info; otherwise
        *output_path* is used unchanged.

        :param output_file: object providing ``save_as(path)``
        :param output_path: full target path, used only when
            *output_dir_path* is empty
        :param output_dir_path: directory in which to build the path
        :param make_author_dir: add a directory named after the author
        :param ext: file extension to append, without the leading dot
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                # Keep the author name as text: joining bytes with text
                # path components raises TypeError.
                save_path = os.path.join(
                    save_path, str(self.book_info.author)
                )
            save_path = os.path.join(save_path, self.book_info.url.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)