Don't validate <uwaga> contents
[librarian.git] / src / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from collections import Counter
9
10 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
11 from librarian import RDFNS
12 from librarian.cover import make_cover
13 from librarian import dcparser
14
15 from xml.parsers.expat import ExpatError
16 from lxml import etree
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18
19 import os
20 import re
21 import six
22
23
24 from .elements import WL_ELEMENTS
25
26
class WLElementLookup(etree.CustomElementClassLookup):
    """Element class lookup backed by the ``WL_ELEMENTS`` mapping."""

    def lookup(self, node_type, document, namespace, name):
        """Return the class registered for ``name`` in ``WL_ELEMENTS``.

        ``None`` is returned for anything that is not an element, for any
        namespaced element, and for tag names missing from ``WL_ELEMENTS``.
        """
        # Only plain, un-namespaced elements can have a custom class.
        if node_type != 'element' or namespace:
            return None
        try:
            element_class = WL_ELEMENTS[name]
        except KeyError:
            element_class = None
        return element_class
37
38
# Module-level XML parser that resolves element classes via WLElementLookup.
parser = etree.XMLParser()
parser.set_element_class_lookup(WLElementLookup())
43
44
45
46 class WLDocument(object):
47     """Legacy class, to be replaced with documents.WLDocument."""
48     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
49     provider = None
50
51     def __init__(self, edoc, parse_dublincore=True, provider=None,
52                  strict=False, meta_fallbacks=None):
53         self.edoc = edoc
54         self.provider = provider
55
56         root_elem = edoc.getroot()
57
58         dc_path = './/' + RDFNS('RDF')
59
60         if root_elem.tag != 'utwor':
61             raise ValidationError(
62                 "Invalid root element. Found '%s', should be 'utwor'"
63                 % root_elem.tag
64             )
65
66         if parse_dublincore:
67             self.rdf_elem = root_elem.find(dc_path)
68
69             if self.rdf_elem is None:
70                 raise NoDublinCore(
71                     "Document must have a '%s' element." % RDFNS('RDF')
72                 )
73
74             self.book_info = dcparser.BookInfo.from_element(
75                 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
76         else:
77             self.book_info = None
78
79     def get_statistics(self):
80         def count_text(text, counter, in_fn=False):
81             if text:
82                 text = re.sub(r'\s+', ' ', text)
83
84                 chars = len(text) if text.strip() else 0
85                 words = len(text.split()) if text.strip() else 0
86                 
87                 counter['chars'] += chars
88                 counter['words'] += words
89                 if not in_fn:
90                     counter['chars_with_fn'] += chars
91                     counter['words_with_fn'] += words
92                 
93         def count(elem, counter, in_fn=False):
94             if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
95                 return
96             if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
97                 in_fn = True
98             count_text(elem.text, counter, in_fn=in_fn)
99             for child in elem:
100                 count(child, counter, in_fn=in_fn)
101                 count_text(child.tail, counter, in_fn=in_fn)
102             
103             
104         data = {
105             "self": Counter(),
106             "parts": [],
107             "total": {
108             }
109         }
110
111         count(self.edoc.getroot(), data['self'])
112         for k, v in data['self'].items():
113             data['total'][k] = v
114         
115         for part in self.parts(pass_part_errors=True):
116             if isinstance(part, Exception):
117                 data['parts'].append((None, {}))
118             else:
119                 data['parts'].append((part, part.get_statistics()))
120                 for k, v in data['parts'][-1][1]['total'].items():
121                     data['total'][k] = data['total'].get(k, 0) + v
122             
123         return data
124
125     @classmethod
126     def from_bytes(cls, xml, *args, **kwargs):
127         return cls.from_file(six.BytesIO(xml), *args, **kwargs)
128
129     @classmethod
130     def from_file(cls, xmlfile, *args, **kwargs):
131
132         # first, prepare for parsing
133         if isinstance(xmlfile, six.text_type):
134             file = open(xmlfile, 'rb')
135             try:
136                 data = file.read()
137             finally:
138                 file.close()
139         else:
140             data = xmlfile.read()
141
142         if not isinstance(data, six.text_type):
143             data = data.decode('utf-8')
144
145         data = data.replace(u'\ufeff', '')
146
147         try:
148             parser = etree.XMLParser(remove_blank_text=False)
149             tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
150
151             return cls(tree, *args, **kwargs)
152         except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
153             raise ParseError(e)
154
155     def swap_endlines(self):
156         """Converts line breaks in stanzas into <br/> tags."""
157         # only swap inside stanzas
158         for elem in self.edoc.iter('strofa'):
159             for child in list(elem):
160                 if child.tail:
161                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
162                     ins_index = elem.index(child) + 1
163                     while len(chunks) > 1:
164                         ins = etree.Element('br')
165                         ins.tail = chunks.pop()
166                         elem.insert(ins_index, ins)
167                     child.tail = chunks.pop(0)
168             if elem.text:
169                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
170                 while len(chunks) > 1:
171                     ins = etree.Element('br')
172                     ins.tail = chunks.pop()
173                     elem.insert(0, ins)
174                 elem.text = chunks.pop(0)
175
176     def parts(self, pass_part_errors=False):
177         if self.provider is None:
178             raise NoProvider('No document provider supplied.')
179         if self.book_info is None:
180             raise NoDublinCore('No Dublin Core in document.')
181         for part_uri in self.book_info.parts:
182             try:
183                 yield self.from_file(
184                     self.provider.by_slug(part_uri.slug), provider=self.provider
185                 )
186             except Exception as e:
187                 if pass_part_errors:
188                     yield e
189                 else:
190                     raise
191
192     def chunk(self, path):
193         # convert the path to XPath
194         expr = self.path_to_xpath(path)
195         elems = self.edoc.xpath(expr)
196
197         if len(elems) == 0:
198             return None
199         else:
200             return elems[0]
201
202     def path_to_xpath(self, path):
203         parts = []
204
205         for part in path.split('/'):
206             match = re.match(r'([^\[]+)\[(\d+)\]', part)
207             if not match:
208                 parts.append(part)
209             else:
210                 tag, n = match.groups()
211                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
212
213         if parts[0] == '.':
214             parts[0] = ''
215
216         return '/'.join(parts)
217
218     def transform(self, stylesheet, **options):
219         return self.edoc.xslt(stylesheet, **options)
220
221     def update_dc(self):
222         if self.book_info:
223             parent = self.rdf_elem.getparent()
224             parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
225
226     def serialize(self):
227         self.update_dc()
228         return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
229
230     def merge_chunks(self, chunk_dict):
231         unmerged = []
232
233         for key, data in chunk_dict.iteritems():
234             try:
235                 xpath = self.path_to_xpath(key)
236                 node = self.edoc.xpath(xpath)[0]
237                 repl = etree.fromstring(
238                     "<%s>%s</%s>" % (node.tag, data, node.tag)
239                 )
240                 node.getparent().replace(node, repl)
241             except Exception as e:
242                 unmerged.append(repr((key, xpath, e)))
243
244         return unmerged
245
246     def clean_ed_note(self, note_tag='nota_red'):
247         """ deletes forbidden tags from nota_red """
248
249         for node in self.edoc.xpath('|'.join(
250                 '//%s//%s' % (note_tag, tag) for tag in
251                 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
252             tail = node.tail
253             node.clear()
254             node.tag = 'span'
255             node.tail = tail
256
257     def fix_pa_akap(self):
258         for pa in ('pa','pe','pr','pt'):
259             for akap in self.edoc.findall(f'//{pa}/akap'):
260                 akap.getparent().set('blocks', 'true')
261                 if not akap.getparent().index(akap):
262                     akap.set('inline', 'true')
263             
264     def editors(self):
265         """Returns a set of all editors for book and its children.
266
267         :returns: set of dcparser.Person objects
268         """
269         if self.book_info is None:
270             raise NoDublinCore('No Dublin Core in document.')
271         persons = set(self.book_info.editors
272                       + self.book_info.technical_editors)
273         for child in self.parts():
274             persons.update(child.editors())
275         if None in persons:
276             persons.remove(None)
277         return persons
278
279     # Converters
280
281     def as_html(self, *args, **kwargs):
282         from librarian import html
283         return html.transform(self, *args, **kwargs)
284
285     def as_text(self, *args, **kwargs):
286         from librarian import text
287         return text.transform(self, *args, **kwargs)
288
289     def as_epub(self, *args, **kwargs):
290         from librarian import epub
291         return epub.transform(self, *args, **kwargs)
292
293     def as_pdf(self, *args, **kwargs):
294         from librarian import pdf
295         return pdf.transform(self, *args, **kwargs)
296
297     def as_mobi(self, *args, **kwargs):
298         from librarian import mobi
299         return mobi.transform(self, *args, **kwargs)
300
301     def as_fb2(self, *args, **kwargs):
302         from librarian import fb2
303         return fb2.transform(self, *args, **kwargs)
304
305     def as_cover(self, cover_class=None, *args, **kwargs):
306         if cover_class is None:
307             cover_class = make_cover
308         return cover_class(self.book_info, *args, **kwargs).output_file()
309
310     # for debugging only
311     def latex_dir(self, *args, **kwargs):
312         kwargs['latex_dir'] = True
313         from librarian import pdf
314         return pdf.transform(self, *args, **kwargs)
315
316     def save_output_file(self, output_file, output_path=None,
317                          output_dir_path=None, make_author_dir=False,
318                          ext=None):
319         if output_dir_path:
320             save_path = output_dir_path
321             if make_author_dir:
322                 save_path = os.path.join(
323                     save_path,
324                     six.text_type(self.book_info.author).encode('utf-8')
325                 )
326             save_path = os.path.join(save_path, self.book_info.url.slug)
327             if ext:
328                 save_path += '.%s' % ext
329         else:
330             save_path = output_path
331
332         output_file.save_as(save_path)