Verse counters.
[librarian.git] / src / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from collections import Counter
9
10 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
11 from librarian import RDFNS
12 from librarian.cover import make_cover
13 from librarian import dcparser
14
15 from xml.parsers.expat import ExpatError
16 from lxml import etree
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18
19 import os
20 import re
21 import six
22
23
24 from .elements import WL_ELEMENTS
25
26
class WLElementLookup(etree.CustomElementClassLookup):
    """Element class lookup that maps WL tag names to element classes."""

    def lookup(self, node_type, document, namespace, name):
        """Return the class registered in WL_ELEMENTS for ``name``.

        Only element nodes without a namespace are looked up; for any other
        node, and for names missing from WL_ELEMENTS, None is returned.
        """
        if node_type == 'element' and not namespace:
            try:
                return WL_ELEMENTS[name]
            except KeyError:
                pass
        return None
37
38
# Module-level XML parser whose element classes are resolved through
# WLElementLookup.
parser = etree.XMLParser()
parser.set_element_class_lookup(WLElementLookup())
43
44
45
class WLDocument(object):
    """Legacy class, to be replaced with documents.WLDocument."""
    # Matches a slash followed by one whitespace character; swap_endlines()
    # splits stanza text on it to find verse boundaries.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    # Class-level default; __init__ assigns the per-instance provider.
    provider = None
50
51     def __init__(self, edoc, parse_dublincore=True, provider=None,
52                  strict=False, meta_fallbacks=None):
53         self.edoc = edoc
54         self.provider = provider
55
56         root_elem = edoc.getroot()
57
58         dc_path = './/' + RDFNS('RDF')
59
60         if root_elem.tag != 'utwor':
61             raise ValidationError(
62                 "Invalid root element. Found '%s', should be 'utwor'"
63                 % root_elem.tag
64             )
65
66         if parse_dublincore:
67             self.rdf_elem = root_elem.find(dc_path)
68
69             if self.rdf_elem is None:
70                 raise NoDublinCore(
71                     "Document must have a '%s' element." % RDFNS('RDF')
72                 )
73
74             self.book_info = dcparser.BookInfo.from_element(
75                 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
76         else:
77             self.book_info = None
78
79     def get_statistics(self):
80         def count_text(text, counter, in_fn=False, stanza=False):
81             if text:
82                 text = re.sub(r'\s+', ' ', text)
83
84                 chars = len(text) if text.strip() else 0
85                 words = len(text.split()) if text.strip() else 0
86                 
87                 counter['chars_with_fn'] += chars
88                 counter['words_with_fn'] += words
89                 if not in_fn:
90                     counter['chars'] += chars
91                     counter['words'] += words
92                 if not stanza:
93                     counter['chars_out_verse_with_fn'] += chars
94                     if not in_fn:
95                         counter['chars_out_verse'] += chars
96                 
97         def count(elem, counter, in_fn=False, stanza=False):
98             if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
99                 return
100             if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
101                 in_fn = True
102             if elem.tag == 'strofa':
103                 # count verses now
104                 verses = len(elem.findall('.//br')) + 1
105                 counter['verses_with_fn'] += verses
106                 if not in_fn:
107                     counter['verses'] += verses
108                 stanza = True
109             count_text(elem.text, counter, in_fn=in_fn, stanza=stanza)
110             for child in elem:
111                 count(child, counter, in_fn=in_fn, stanza=stanza)
112                 count_text(child.tail, counter, in_fn=in_fn, stanza=stanza)
113
114         self.swap_endlines()
115
116         data = {
117             "self": Counter(),
118             "parts": [],
119             "total": {
120             }
121         }
122
123         count(self.edoc.getroot(), data['self'])
124         for k, v in data['self'].items():
125             data['total'][k] = v
126         
127         for part in self.parts(pass_part_errors=True):
128             if isinstance(part, Exception):
129                 data['parts'].append((None, {}))
130             else:
131                 data['parts'].append((part, part.get_statistics()))
132                 for k, v in data['parts'][-1][1]['total'].items():
133                     data['total'][k] = data['total'].get(k, 0) + v
134             
135         return data
136
137     @classmethod
138     def from_bytes(cls, xml, *args, **kwargs):
139         return cls.from_file(six.BytesIO(xml), *args, **kwargs)
140
141     @classmethod
142     def from_file(cls, xmlfile, *args, **kwargs):
143
144         # first, prepare for parsing
145         if isinstance(xmlfile, six.text_type):
146             file = open(xmlfile, 'rb')
147             try:
148                 data = file.read()
149             finally:
150                 file.close()
151         else:
152             data = xmlfile.read()
153
154         if not isinstance(data, six.text_type):
155             data = data.decode('utf-8')
156
157         data = data.replace(u'\ufeff', '')
158
159         try:
160             parser = etree.XMLParser(remove_blank_text=False)
161             tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
162
163             return cls(tree, *args, **kwargs)
164         except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
165             raise ParseError(e)
166
167     def swap_endlines(self):
168         """Converts line breaks in stanzas into <br/> tags."""
169         # only swap inside stanzas
170         for elem in self.edoc.iter('strofa'):
171             for child in list(elem):
172                 if child.tail:
173                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
174                     ins_index = elem.index(child) + 1
175                     while len(chunks) > 1:
176                         ins = etree.Element('br')
177                         ins.tail = chunks.pop()
178                         elem.insert(ins_index, ins)
179                     child.tail = chunks.pop(0)
180             if elem.text:
181                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
182                 while len(chunks) > 1:
183                     ins = etree.Element('br')
184                     ins.tail = chunks.pop()
185                     elem.insert(0, ins)
186                 elem.text = chunks.pop(0)
187
188     def parts(self, pass_part_errors=False):
189         if self.provider is None:
190             raise NoProvider('No document provider supplied.')
191         if self.book_info is None:
192             raise NoDublinCore('No Dublin Core in document.')
193         for part_uri in self.book_info.parts:
194             try:
195                 yield self.from_file(
196                     self.provider.by_slug(part_uri.slug), provider=self.provider
197                 )
198             except Exception as e:
199                 if pass_part_errors:
200                     yield e
201                 else:
202                     raise
203
204     def chunk(self, path):
205         # convert the path to XPath
206         expr = self.path_to_xpath(path)
207         elems = self.edoc.xpath(expr)
208
209         if len(elems) == 0:
210             return None
211         else:
212             return elems[0]
213
214     def path_to_xpath(self, path):
215         parts = []
216
217         for part in path.split('/'):
218             match = re.match(r'([^\[]+)\[(\d+)\]', part)
219             if not match:
220                 parts.append(part)
221             else:
222                 tag, n = match.groups()
223                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
224
225         if parts[0] == '.':
226             parts[0] = ''
227
228         return '/'.join(parts)
229
230     def transform(self, stylesheet, **options):
231         return self.edoc.xslt(stylesheet, **options)
232
233     def update_dc(self):
234         if self.book_info:
235             parent = self.rdf_elem.getparent()
236             parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
237
238     def serialize(self):
239         self.update_dc()
240         return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
241
242     def merge_chunks(self, chunk_dict):
243         unmerged = []
244
245         for key, data in chunk_dict.iteritems():
246             try:
247                 xpath = self.path_to_xpath(key)
248                 node = self.edoc.xpath(xpath)[0]
249                 repl = etree.fromstring(
250                     "<%s>%s</%s>" % (node.tag, data, node.tag)
251                 )
252                 node.getparent().replace(node, repl)
253             except Exception as e:
254                 unmerged.append(repr((key, xpath, e)))
255
256         return unmerged
257
258     def clean_ed_note(self, note_tag='nota_red'):
259         """ deletes forbidden tags from nota_red """
260
261         for node in self.edoc.xpath('|'.join(
262                 '//%s//%s' % (note_tag, tag) for tag in
263                 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
264             tail = node.tail
265             node.clear()
266             node.tag = 'span'
267             node.tail = tail
268
269     def fix_pa_akap(self):
270         for pa in ('pa','pe','pr','pt'):
271             for akap in self.edoc.findall(f'//{pa}/akap'):
272                 akap.getparent().set('blocks', 'true')
273                 if not akap.getparent().index(akap):
274                     akap.set('inline', 'true')
275             
276     def editors(self):
277         """Returns a set of all editors for book and its children.
278
279         :returns: set of dcparser.Person objects
280         """
281         if self.book_info is None:
282             raise NoDublinCore('No Dublin Core in document.')
283         persons = set(self.book_info.editors
284                       + self.book_info.technical_editors)
285         for child in self.parts():
286             persons.update(child.editors())
287         if None in persons:
288             persons.remove(None)
289         return persons
290
291     # Converters
292
    def as_html(self, *args, **kwargs):
        """Convert this document with librarian.html.transform.

        Extra arguments are forwarded unchanged. The converter module is
        imported when the method is called, not at module load.
        """
        from librarian import html
        return html.transform(self, *args, **kwargs)
296
    def as_text(self, *args, **kwargs):
        """Convert this document with librarian.text.transform.

        Extra arguments are forwarded unchanged. The converter module is
        imported when the method is called, not at module load.
        """
        from librarian import text
        return text.transform(self, *args, **kwargs)
300
    def as_epub(self, *args, **kwargs):
        """Convert this document with librarian.epub.transform.

        Extra arguments are forwarded unchanged. The converter module is
        imported when the method is called, not at module load.
        """
        from librarian import epub
        return epub.transform(self, *args, **kwargs)
304
    def as_pdf(self, *args, **kwargs):
        """Convert this document with librarian.pdf.transform.

        Extra arguments are forwarded unchanged. The converter module is
        imported when the method is called, not at module load.
        """
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)
308
    def as_mobi(self, *args, **kwargs):
        """Convert this document with librarian.mobi.transform.

        Extra arguments are forwarded unchanged. The converter module is
        imported when the method is called, not at module load.
        """
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)
312
    def as_fb2(self, *args, **kwargs):
        """Convert this document with librarian.fb2.transform.

        Extra arguments are forwarded unchanged. The converter module is
        imported when the method is called, not at module load.
        """
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)
316
317     def as_cover(self, cover_class=None, *args, **kwargs):
318         if cover_class is None:
319             cover_class = make_cover
320         return cover_class(self.book_info, *args, **kwargs).output_file()
321
322     # for debugging only
    def latex_dir(self, *args, **kwargs):
        """Run librarian.pdf.transform with ``latex_dir=True``.

        Any ``latex_dir`` value supplied by the caller is overwritten;
        all other arguments are forwarded unchanged.
        """
        kwargs['latex_dir'] = True
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)
327
328     def save_output_file(self, output_file, output_path=None,
329                          output_dir_path=None, make_author_dir=False,
330                          ext=None):
331         if output_dir_path:
332             save_path = output_dir_path
333             if make_author_dir:
334                 save_path = os.path.join(
335                     save_path,
336                     six.text_type(self.book_info.author).encode('utf-8')
337                 )
338             save_path = os.path.join(save_path, self.book_info.url.slug)
339             if ext:
340                 save_path += '.%s' % ext
341         else:
342             save_path = output_path
343
344         output_file.save_as(save_path)