1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import unicode_literals
8 from collections import Counter
10 from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
11 from librarian import RDFNS
12 from librarian.cover import make_cover
13 from librarian import dcparser
15 from xml.parsers.expat import ExpatError
16 from lxml import etree
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
24 from .elements import WL_ELEMENTS
27 class WLElementLookup(etree.CustomElementClassLookup):
28 def lookup(self, node_type, document, namespace, name):
29 if node_type != 'element':
33 return WL_ELEMENTS[name]
36 parser = etree.XMLParser()
37 parser.set_element_class_lookup(
43 class WLDocument(object):
44 """Legacy class, to be replaced with documents.WLDocument."""
45 LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
48 def __init__(self, edoc, parse_dublincore=True, provider=None,
49 strict=False, meta_fallbacks=None):
51 self.provider = provider
53 root_elem = edoc.getroot()
55 dc_path = './/' + RDFNS('RDF')
57 if root_elem.tag != 'utwor':
58 raise ValidationError(
59 "Invalid root element. Found '%s', should be 'utwor'"
64 self.rdf_elem = root_elem.find(dc_path)
66 if self.rdf_elem is None:
68 "Document must have a '%s' element." % RDFNS('RDF')
71 self.book_info = dcparser.BookInfo.from_element(
72 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
76 def get_statistics(self):
77 def count_text(text, counter, in_fn=False):
79 text = re.sub(r'\s+', ' ', text)
81 chars = len(text) if text.strip() else 0
82 words = len(text.split()) if text.strip() else 0
84 counter['chars'] += chars
85 counter['words'] += words
87 counter['chars_with_fn'] += chars
88 counter['words_with_fn'] += words
90 def count(elem, counter, in_fn=False):
91 if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
93 if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
95 count_text(elem.text, counter, in_fn=in_fn)
97 count(child, counter, in_fn=in_fn)
98 count_text(child.tail, counter, in_fn=in_fn)
108 count(self.edoc.getroot(), data['self'])
109 for k, v in data['self'].items():
112 for part in self.parts(pass_part_errors=True):
113 if isinstance(part, Exception):
114 data['parts'].append((None, {}))
116 data['parts'].append((part, part.get_statistics()))
117 for k, v in data['parts'][-1][1]['total'].items():
118 data['total'][k] = data['total'].get(k, 0) + v
def from_bytes(cls, xml, *args, **kwargs):
    """Build a document from XML content already held in memory.

    ``xml`` is wrapped in a ``six.BytesIO`` stream and handed to
    ``cls.from_file`` together with any extra positional and keyword
    arguments. Whatever ``from_file`` returns is returned unchanged.
    """
    stream = six.BytesIO(xml)
    return cls.from_file(stream, *args, **kwargs)
127 def from_file(cls, xmlfile, *args, **kwargs):
129 # first, prepare for parsing
130 if isinstance(xmlfile, six.text_type):
131 file = open(xmlfile, 'rb')
137 data = xmlfile.read()
139 if not isinstance(data, six.text_type):
140 data = data.decode('utf-8')
142 data = data.replace(u'\ufeff', '')
145 parser = etree.XMLParser(remove_blank_text=False)
146 tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
148 return cls(tree, *args, **kwargs)
149 except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
152 def swap_endlines(self):
153 """Converts line breaks in stanzas into <br/> tags."""
154 # only swap inside stanzas
155 for elem in self.edoc.iter('strofa'):
156 for child in list(elem):
158 chunks = self.LINE_SWAP_EXPR.split(child.tail)
159 ins_index = elem.index(child) + 1
160 while len(chunks) > 1:
161 ins = etree.Element('br')
162 ins.tail = chunks.pop()
163 elem.insert(ins_index, ins)
164 child.tail = chunks.pop(0)
166 chunks = self.LINE_SWAP_EXPR.split(elem.text)
167 while len(chunks) > 1:
168 ins = etree.Element('br')
169 ins.tail = chunks.pop()
171 elem.text = chunks.pop(0)
173 def parts(self, pass_part_errors=False):
174 if self.provider is None:
175 raise NoProvider('No document provider supplied.')
176 if self.book_info is None:
177 raise NoDublinCore('No Dublin Core in document.')
178 for part_uri in self.book_info.parts:
180 yield self.from_file(
181 self.provider.by_uri(part_uri), provider=self.provider
183 except Exception as e:
189 def chunk(self, path):
190 # convert the path to XPath
191 expr = self.path_to_xpath(path)
192 elems = self.edoc.xpath(expr)
199 def path_to_xpath(self, path):
202 for part in path.split('/'):
203 match = re.match(r'([^\[]+)\[(\d+)\]', part)
207 tag, n = match.groups()
208 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
213 return '/'.join(parts)
def transform(self, stylesheet, **options):
    """Apply ``stylesheet`` to the wrapped document tree.

    The work is delegated to ``self.edoc.xslt``; keyword ``options`` are
    forwarded untouched and the result of that call is returned as-is.
    """
    document = self.edoc
    return document.xslt(stylesheet, **options)
220 parent = self.rdf_elem.getparent()
221 parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
225 return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
227 def merge_chunks(self, chunk_dict):
230 for key, data in chunk_dict.iteritems():
232 xpath = self.path_to_xpath(key)
233 node = self.edoc.xpath(xpath)[0]
234 repl = etree.fromstring(
235 "<%s>%s</%s>" % (node.tag, data, node.tag)
237 node.getparent().replace(node, repl)
238 except Exception as e:
239 unmerged.append(repr((key, xpath, e)))
243 def clean_ed_note(self, note_tag='nota_red'):
244 """ deletes forbidden tags from nota_red """
246 for node in self.edoc.xpath('|'.join(
247 '//%s//%s' % (note_tag, tag) for tag in
248 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
def fix_pa_akap(self):
    """Mark ``pa``/``pe``/``pr``/``pt`` elements that directly contain ``akap``.

    For every ``akap`` element that is a direct child of one of those four
    tags, the parent gets ``blocks="true"``. An ``akap`` that is the first
    child of its parent (index 0) additionally gets ``inline="true"``.
    The tree in ``self.edoc`` is modified in place; nothing is returned.
    """
    for tag in ('pa', 'pe', 'pr', 'pt'):
        # Use an explicit relative path. A path starting with '/' passed to
        # ElementTree.findall() is rewritten to this same './/...' form, but
        # only after emitting a FutureWarning on every call.
        for akap in self.edoc.findall(f'.//{tag}/akap'):
            parent = akap.getparent()
            parent.set('blocks', 'true')
            if parent.index(akap) == 0:
                akap.set('inline', 'true')
262 """Returns a set of all editors for book and its children.
264 :returns: set of dcparser.Person objects
266 if self.book_info is None:
267 raise NoDublinCore('No Dublin Core in document.')
268 persons = set(self.book_info.editors
269 + self.book_info.technical_editors)
270 for child in self.parts():
271 persons.update(child.editors())
def as_html(self, *args, **kwargs):
    """Return this document converted by ``librarian.html.transform``.

    The document itself is passed first, followed by the caller's
    positional and keyword arguments.
    """
    # Imported at call time rather than at module load.
    from librarian import html as html_backend
    return html_backend.transform(self, *args, **kwargs)
def as_text(self, *args, **kwargs):
    """Return this document converted by ``librarian.text.transform``.

    The document itself is passed first, followed by the caller's
    positional and keyword arguments.
    """
    # Imported at call time rather than at module load.
    from librarian import text as text_backend
    return text_backend.transform(self, *args, **kwargs)
def as_epub(self, *args, **kwargs):
    """Return this document converted by ``librarian.epub.transform``.

    The document itself is passed first, followed by the caller's
    positional and keyword arguments.
    """
    # Imported at call time rather than at module load.
    from librarian import epub as epub_backend
    return epub_backend.transform(self, *args, **kwargs)
def as_pdf(self, *args, **kwargs):
    """Return this document converted by ``librarian.pdf.transform``.

    The document itself is passed first, followed by the caller's
    positional and keyword arguments.
    """
    # Imported at call time rather than at module load.
    from librarian import pdf as pdf_backend
    return pdf_backend.transform(self, *args, **kwargs)
def as_mobi(self, *args, **kwargs):
    """Return this document converted by ``librarian.mobi.transform``.

    The document itself is passed first, followed by the caller's
    positional and keyword arguments.
    """
    # Imported at call time rather than at module load.
    from librarian import mobi as mobi_backend
    return mobi_backend.transform(self, *args, **kwargs)
def as_fb2(self, *args, **kwargs):
    """Return this document converted by ``librarian.fb2.transform``.

    The document itself is passed first, followed by the caller's
    positional and keyword arguments.
    """
    # Imported at call time rather than at module load.
    from librarian import fb2 as fb2_backend
    return fb2_backend.transform(self, *args, **kwargs)
def as_cover(self, cover_class=None, *args, **kwargs):
    """Build a cover for this document and return its output file.

    ``cover_class`` is called with ``self.book_info`` followed by the extra
    positional and keyword arguments; when it is ``None`` the module-level
    ``make_cover`` is used instead. The return value is whatever the
    resulting object's ``output_file()`` returns.
    """
    factory = make_cover if cover_class is None else cover_class
    cover = factory(self.book_info, *args, **kwargs)
    return cover.output_file()
def latex_dir(self, *args, **kwargs):
    """Run ``librarian.pdf.transform`` with ``latex_dir=True`` forced.

    Any ``latex_dir`` value supplied by the caller is overwritten. All
    other positional and keyword arguments are forwarded after the
    document itself, and the backend's result is returned unchanged.
    """
    # Imported at call time rather than at module load.
    from librarian import pdf as pdf_backend
    options = dict(kwargs, latex_dir=True)
    return pdf_backend.transform(self, *args, **options)
313 def save_output_file(self, output_file, output_path=None,
314 output_dir_path=None, make_author_dir=False,
317 save_path = output_dir_path
319 save_path = os.path.join(
321 six.text_type(self.book_info.author).encode('utf-8')
323 save_path = os.path.join(save_path, self.book_info.url.slug)
325 save_path += '.%s' % ext
327 save_path = output_path
329 output_file.save_as(save_path)