Document statistics.
[librarian.git] / src / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from collections import Counter
9
10 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
11 from librarian import RDFNS
12 from librarian.cover import make_cover
13 from librarian import dcparser
14
15 from xml.parsers.expat import ExpatError
16 from lxml import etree
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
18
19 import os
20 import re
21 import six
22
23
24 from .elements import WL_ELEMENTS
25
26
class WLElementLookup(etree.CustomElementClassLookup):
    """Element class lookup that maps plain tag names to WL_ELEMENTS."""

    def lookup(self, node_type, document, namespace, name):
        """Return the custom class for a node, or None for the default.

        Only nodes of type ``'element'`` without a namespace are looked up
        in ``WL_ELEMENTS`` by their tag name; everything else yields None.
        """
        if node_type != 'element' or namespace:
            return None
        try:
            return WL_ELEMENTS[name]
        except KeyError:
            # Unknown tag: returning None hands the decision back to lxml's
            # fallback lookup instead of failing with KeyError.
            return None
34
35
# Module-level XML parser whose elements are resolved by WLElementLookup.
parser = etree.XMLParser()
parser.set_element_class_lookup(WLElementLookup())
40
41
42
43 class WLDocument(object):
44     """Legacy class, to be replaced with documents.WLDocument."""
45     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
46     provider = None
47
48     def __init__(self, edoc, parse_dublincore=True, provider=None,
49                  strict=False, meta_fallbacks=None):
50         self.edoc = edoc
51         self.provider = provider
52
53         root_elem = edoc.getroot()
54
55         dc_path = './/' + RDFNS('RDF')
56
57         if root_elem.tag != 'utwor':
58             raise ValidationError(
59                 "Invalid root element. Found '%s', should be 'utwor'"
60                 % root_elem.tag
61             )
62
63         if parse_dublincore:
64             self.rdf_elem = root_elem.find(dc_path)
65
66             if self.rdf_elem is None:
67                 raise NoDublinCore(
68                     "Document must have a '%s' element." % RDFNS('RDF')
69                 )
70
71             self.book_info = dcparser.BookInfo.from_element(
72                 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
73         else:
74             self.book_info = None
75
76     def get_statistics(self):
77         def count_text(text, counter, in_fn=False):
78             if text:
79                 text = re.sub(r'\s+', ' ', text)
80
81                 chars = len(text) if text.strip() else 0
82                 words = len(text.split()) if text.strip() else 0
83                 
84                 counter['chars'] += chars
85                 counter['words'] += words
86                 if not in_fn:
87                     counter['chars_with_fn'] += chars
88                     counter['words_with_fn'] += words
89                 
90         def count(elem, counter, in_fn=False):
91             if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
92                 return
93             if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
94                 in_fn = True
95             count_text(elem.text, counter, in_fn=in_fn)
96             for child in elem:
97                 count(child, counter, in_fn=in_fn)
98                 count_text(child.tail, counter, in_fn=in_fn)
99             
100             
101         data = {
102             "self": Counter(),
103             "parts": [],
104             "total": {
105             }
106         }
107
108         count(self.edoc.getroot(), data['self'])
109         for k, v in data['self'].items():
110             data['total'][k] = v
111         
112         for part in self.parts(pass_part_errors=True):
113             if isinstance(part, Exception):
114                 data['parts'].append((None, {}))
115             else:
116                 data['parts'].append((part, part.get_statistics()))
117                 for k, v in data['parts'][-1][1]['total'].items():
118                     data['total'][k] += v
119             
120         return data
121
122     @classmethod
123     def from_bytes(cls, xml, *args, **kwargs):
124         return cls.from_file(six.BytesIO(xml), *args, **kwargs)
125
126     @classmethod
127     def from_file(cls, xmlfile, *args, **kwargs):
128
129         # first, prepare for parsing
130         if isinstance(xmlfile, six.text_type):
131             file = open(xmlfile, 'rb')
132             try:
133                 data = file.read()
134             finally:
135                 file.close()
136         else:
137             data = xmlfile.read()
138
139         if not isinstance(data, six.text_type):
140             data = data.decode('utf-8')
141
142         data = data.replace(u'\ufeff', '')
143
144         try:
145             parser = etree.XMLParser(remove_blank_text=False)
146             tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
147
148             return cls(tree, *args, **kwargs)
149         except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
150             raise ParseError(e)
151
152     def swap_endlines(self):
153         """Converts line breaks in stanzas into <br/> tags."""
154         # only swap inside stanzas
155         for elem in self.edoc.iter('strofa'):
156             for child in list(elem):
157                 if child.tail:
158                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
159                     ins_index = elem.index(child) + 1
160                     while len(chunks) > 1:
161                         ins = etree.Element('br')
162                         ins.tail = chunks.pop()
163                         elem.insert(ins_index, ins)
164                     child.tail = chunks.pop(0)
165             if elem.text:
166                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
167                 while len(chunks) > 1:
168                     ins = etree.Element('br')
169                     ins.tail = chunks.pop()
170                     elem.insert(0, ins)
171                 elem.text = chunks.pop(0)
172
173     def parts(self, pass_part_errors=False):
174         if self.provider is None:
175             raise NoProvider('No document provider supplied.')
176         if self.book_info is None:
177             raise NoDublinCore('No Dublin Core in document.')
178         for part_uri in self.book_info.parts:
179             try:
180                 yield self.from_file(
181                     self.provider.by_uri(part_uri), provider=self.provider
182                 )
183             except Exception as e:
184                 if pass_part_errors:
185                     yield e
186                 else:
187                     raise
188
189     def chunk(self, path):
190         # convert the path to XPath
191         expr = self.path_to_xpath(path)
192         elems = self.edoc.xpath(expr)
193
194         if len(elems) == 0:
195             return None
196         else:
197             return elems[0]
198
199     def path_to_xpath(self, path):
200         parts = []
201
202         for part in path.split('/'):
203             match = re.match(r'([^\[]+)\[(\d+)\]', part)
204             if not match:
205                 parts.append(part)
206             else:
207                 tag, n = match.groups()
208                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
209
210         if parts[0] == '.':
211             parts[0] = ''
212
213         return '/'.join(parts)
214
215     def transform(self, stylesheet, **options):
216         return self.edoc.xslt(stylesheet, **options)
217
218     def update_dc(self):
219         if self.book_info:
220             parent = self.rdf_elem.getparent()
221             parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
222
223     def serialize(self):
224         self.update_dc()
225         return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
226
227     def merge_chunks(self, chunk_dict):
228         unmerged = []
229
230         for key, data in chunk_dict.iteritems():
231             try:
232                 xpath = self.path_to_xpath(key)
233                 node = self.edoc.xpath(xpath)[0]
234                 repl = etree.fromstring(
235                     "<%s>%s</%s>" % (node.tag, data, node.tag)
236                 )
237                 node.getparent().replace(node, repl)
238             except Exception as e:
239                 unmerged.append(repr((key, xpath, e)))
240
241         return unmerged
242
243     def clean_ed_note(self, note_tag='nota_red'):
244         """ deletes forbidden tags from nota_red """
245
246         for node in self.edoc.xpath('|'.join(
247                 '//%s//%s' % (note_tag, tag) for tag in
248                 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
249             tail = node.tail
250             node.clear()
251             node.tag = 'span'
252             node.tail = tail
253
254     def fix_pa_akap(self):
255         for pa in ('pa','pe','pr','pt'):
256             for akap in self.edoc.findall(f'//{pa}/akap'):
257                 akap.getparent().set('blocks', 'true')
258                 if not akap.getparent().index(akap):
259                     akap.set('inline', 'true')
260             
261     def editors(self):
262         """Returns a set of all editors for book and its children.
263
264         :returns: set of dcparser.Person objects
265         """
266         if self.book_info is None:
267             raise NoDublinCore('No Dublin Core in document.')
268         persons = set(self.book_info.editors
269                       + self.book_info.technical_editors)
270         for child in self.parts():
271             persons.update(child.editors())
272         if None in persons:
273             persons.remove(None)
274         return persons
275
276     # Converters
277
278     def as_html(self, *args, **kwargs):
279         from librarian import html
280         return html.transform(self, *args, **kwargs)
281
282     def as_text(self, *args, **kwargs):
283         from librarian import text
284         return text.transform(self, *args, **kwargs)
285
286     def as_epub(self, *args, **kwargs):
287         from librarian import epub
288         return epub.transform(self, *args, **kwargs)
289
290     def as_pdf(self, *args, **kwargs):
291         from librarian import pdf
292         return pdf.transform(self, *args, **kwargs)
293
294     def as_mobi(self, *args, **kwargs):
295         from librarian import mobi
296         return mobi.transform(self, *args, **kwargs)
297
298     def as_fb2(self, *args, **kwargs):
299         from librarian import fb2
300         return fb2.transform(self, *args, **kwargs)
301
302     def as_cover(self, cover_class=None, *args, **kwargs):
303         if cover_class is None:
304             cover_class = make_cover
305         return cover_class(self.book_info, *args, **kwargs).output_file()
306
307     # for debugging only
308     def latex_dir(self, *args, **kwargs):
309         kwargs['latex_dir'] = True
310         from librarian import pdf
311         return pdf.transform(self, *args, **kwargs)
312
313     def save_output_file(self, output_file, output_path=None,
314                          output_dir_path=None, make_author_dir=False,
315                          ext=None):
316         if output_dir_path:
317             save_path = output_dir_path
318             if make_author_dir:
319                 save_path = os.path.join(
320                     save_path,
321                     six.text_type(self.book_info.author).encode('utf-8')
322                 )
323             save_path = os.path.join(save_path, self.book_info.url.slug)
324             if ext:
325                 save_path += '.%s' % ext
326         else:
327             save_path = output_path
328
329         output_file.save_as(save_path)