1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import unicode_literals
8 from collections import Counter
10 from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
11 from librarian import RDFNS
12 from librarian.cover import make_cover
13 from librarian import dcparser
15 from xml.parsers.expat import ExpatError
16 from lxml import etree
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
24 from .elements import WL_ELEMENTS
27 class WLElementLookup(etree.CustomElementClassLookup):
28 def lookup(self, node_type, document, namespace, name):
29 if node_type != 'element':
34 return WL_ELEMENTS[name]
39 parser = etree.XMLParser()
40 parser.set_element_class_lookup(
46 class WLDocument(object):
47 """Legacy class, to be replaced with documents.WLDocument."""
48 LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
51 def __init__(self, edoc, parse_dublincore=True, provider=None,
52 strict=False, meta_fallbacks=None):
54 self.provider = provider
56 root_elem = edoc.getroot()
58 dc_path = './/' + RDFNS('RDF')
60 if root_elem.tag != 'utwor':
61 raise ValidationError(
62 "Invalid root element. Found '%s', should be 'utwor'"
67 self.rdf_elem = root_elem.find(dc_path)
69 if self.rdf_elem is None:
71 "Document must have a '%s' element." % RDFNS('RDF')
74 self.book_info = dcparser.BookInfo.from_element(
75 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
79 def get_statistics(self):
80 def count_text(text, counter, in_fn=False):
82 text = re.sub(r'\s+', ' ', text)
84 chars = len(text) if text.strip() else 0
85 words = len(text.split()) if text.strip() else 0
87 counter['chars'] += chars
88 counter['words'] += words
90 counter['chars_with_fn'] += chars
91 counter['words_with_fn'] += words
93 def count(elem, counter, in_fn=False):
94 if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
96 if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
98 count_text(elem.text, counter, in_fn=in_fn)
100 count(child, counter, in_fn=in_fn)
101 count_text(child.tail, counter, in_fn=in_fn)
111 count(self.edoc.getroot(), data['self'])
112 for k, v in data['self'].items():
115 for part in self.parts(pass_part_errors=True):
116 if isinstance(part, Exception):
117 data['parts'].append((None, {}))
119 data['parts'].append((part, part.get_statistics()))
120 for k, v in data['parts'][-1][1]['total'].items():
121 data['total'][k] = data['total'].get(k, 0) + v
def from_bytes(cls, xml, *args, **kwargs):
    """Build a document from XML held in memory as bytes.

    The bytes are wrapped in a ``six.BytesIO`` stream, which is then
    handed to :meth:`from_file` together with any extra arguments.

    :param xml: the serialized XML document, as bytes
    :returns: whatever ``cls.from_file`` returns for that stream
    """
    stream = six.BytesIO(xml)
    return cls.from_file(stream, *args, **kwargs)
130 def from_file(cls, xmlfile, *args, **kwargs):
132 # first, prepare for parsing
133 if isinstance(xmlfile, six.text_type):
134 file = open(xmlfile, 'rb')
140 data = xmlfile.read()
142 if not isinstance(data, six.text_type):
143 data = data.decode('utf-8')
145 data = data.replace(u'\ufeff', '')
148 parser = etree.XMLParser(remove_blank_text=False)
149 tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
151 return cls(tree, *args, **kwargs)
152 except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
155 def swap_endlines(self):
156 """Converts line breaks in stanzas into <br/> tags."""
157 # only swap inside stanzas
158 for elem in self.edoc.iter('strofa'):
159 for child in list(elem):
161 chunks = self.LINE_SWAP_EXPR.split(child.tail)
162 ins_index = elem.index(child) + 1
163 while len(chunks) > 1:
164 ins = etree.Element('br')
165 ins.tail = chunks.pop()
166 elem.insert(ins_index, ins)
167 child.tail = chunks.pop(0)
169 chunks = self.LINE_SWAP_EXPR.split(elem.text)
170 while len(chunks) > 1:
171 ins = etree.Element('br')
172 ins.tail = chunks.pop()
174 elem.text = chunks.pop(0)
176 def parts(self, pass_part_errors=False):
177 if self.provider is None:
178 raise NoProvider('No document provider supplied.')
179 if self.book_info is None:
180 raise NoDublinCore('No Dublin Core in document.')
181 for part_uri in self.book_info.parts:
183 yield self.from_file(
184 self.provider.by_slug(part_uri.slug), provider=self.provider
186 except Exception as e:
192 def chunk(self, path):
193 # convert the path to XPath
194 expr = self.path_to_xpath(path)
195 elems = self.edoc.xpath(expr)
202 def path_to_xpath(self, path):
205 for part in path.split('/'):
206 match = re.match(r'([^\[]+)\[(\d+)\]', part)
210 tag, n = match.groups()
211 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
216 return '/'.join(parts)
def transform(self, stylesheet, **options):
    """Run ``stylesheet`` over the wrapped document tree.

    Thin wrapper around ``self.edoc.xslt``; every keyword option is
    forwarded to it unchanged.

    :param stylesheet: stylesheet object accepted by ``self.edoc.xslt``
    :returns: the result of the ``xslt`` call
    """
    apply_xslt = self.edoc.xslt
    return apply_xslt(stylesheet, **options)
223 parent = self.rdf_elem.getparent()
224 parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
228 return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
230 def merge_chunks(self, chunk_dict):
233 for key, data in chunk_dict.iteritems():
235 xpath = self.path_to_xpath(key)
236 node = self.edoc.xpath(xpath)[0]
237 repl = etree.fromstring(
238 "<%s>%s</%s>" % (node.tag, data, node.tag)
240 node.getparent().replace(node, repl)
241 except Exception as e:
242 unmerged.append(repr((key, xpath, e)))
246 def clean_ed_note(self, note_tag='nota_red'):
247 """ deletes forbidden tags from nota_red """
249 for node in self.edoc.xpath('|'.join(
250 '//%s//%s' % (note_tag, tag) for tag in
251 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
def fix_pa_akap(self):
    """Flag ``akap`` elements nested directly in pa/pe/pr/pt elements.

    For each ``akap`` that is a direct child of a ``pa``, ``pe``, ``pr``
    or ``pt`` element, the parent gets ``blocks="true"``.  If the
    ``akap`` is the parent's first child, it additionally gets
    ``inline="true"``.  The tree is modified in place.
    """
    for note_tag in ('pa', 'pe', 'pr', 'pt'):
        for paragraph in self.edoc.findall(f'//{note_tag}/akap'):
            note = paragraph.getparent()
            note.set('blocks', 'true')
            # Index 0 means the paragraph opens its parent element.
            if note.index(paragraph) == 0:
                paragraph.set('inline', 'true')
265 """Returns a set of all editors for book and its children.
267 :returns: set of dcparser.Person objects
269 if self.book_info is None:
270 raise NoDublinCore('No Dublin Core in document.')
271 persons = set(self.book_info.editors
272 + self.book_info.technical_editors)
273 for child in self.parts():
274 persons.update(child.editors())
def as_html(self, *args, **kwargs):
    """Delegate to ``librarian.html.transform`` for this document.

    The module is imported at call time; all positional and keyword
    arguments are forwarded after the document itself.

    :returns: the result of ``html.transform``
    """
    from librarian import html as html_backend
    result = html_backend.transform(self, *args, **kwargs)
    return result
def as_text(self, *args, **kwargs):
    """Delegate to ``librarian.text.transform`` for this document.

    The module is imported at call time; all positional and keyword
    arguments are forwarded after the document itself.

    :returns: the result of ``text.transform``
    """
    from librarian import text as text_backend
    result = text_backend.transform(self, *args, **kwargs)
    return result
def as_epub(self, *args, **kwargs):
    """Delegate to ``librarian.epub.transform`` for this document.

    The module is imported at call time; all positional and keyword
    arguments are forwarded after the document itself.

    :returns: the result of ``epub.transform``
    """
    from librarian import epub as epub_backend
    result = epub_backend.transform(self, *args, **kwargs)
    return result
def as_pdf(self, *args, **kwargs):
    """Delegate to ``librarian.pdf.transform`` for this document.

    The module is imported at call time; all positional and keyword
    arguments are forwarded after the document itself.

    :returns: the result of ``pdf.transform``
    """
    from librarian import pdf as pdf_backend
    result = pdf_backend.transform(self, *args, **kwargs)
    return result
def as_mobi(self, *args, **kwargs):
    """Delegate to ``librarian.mobi.transform`` for this document.

    The module is imported at call time; all positional and keyword
    arguments are forwarded after the document itself.

    :returns: the result of ``mobi.transform``
    """
    from librarian import mobi as mobi_backend
    result = mobi_backend.transform(self, *args, **kwargs)
    return result
def as_fb2(self, *args, **kwargs):
    """Delegate to ``librarian.fb2.transform`` for this document.

    The module is imported at call time; all positional and keyword
    arguments are forwarded after the document itself.

    :returns: the result of ``fb2.transform``
    """
    from librarian import fb2 as fb2_backend
    result = fb2_backend.transform(self, *args, **kwargs)
    return result
def as_cover(self, cover_class=None, *args, **kwargs):
    """Produce the cover output file for this document.

    :param cover_class: callable invoked with ``self.book_info`` plus the
        remaining arguments; when ``None``, ``make_cover`` is used
    :returns: the result of calling ``output_file()`` on the object
        returned by that callable
    """
    factory = make_cover if cover_class is None else cover_class
    cover = factory(self.book_info, *args, **kwargs)
    return cover.output_file()
def latex_dir(self, *args, **kwargs):
    """Call ``librarian.pdf.transform`` with ``latex_dir`` forced to True.

    Any ``latex_dir`` value supplied by the caller is overridden; all
    other arguments are forwarded unchanged after the document itself.

    :returns: the result of ``pdf.transform``
    """
    from librarian import pdf as pdf_backend
    forced = dict(kwargs, latex_dir=True)
    return pdf_backend.transform(self, *args, **forced)
316 def save_output_file(self, output_file, output_path=None,
317 output_dir_path=None, make_author_dir=False,
320 save_path = output_dir_path
322 save_path = os.path.join(
324 six.text_type(self.book_info.author).encode('utf-8')
326 save_path = os.path.join(save_path, self.book_info.url.slug)
328 save_path += '.%s' % ext
330 save_path = output_path
332 output_file.save_as(save_path)