1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import unicode_literals
8 from collections import Counter
10 from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
11 from librarian import RDFNS
12 from librarian.cover import make_cover
13 from librarian import dcparser
15 from xml.parsers.expat import ExpatError
16 from lxml import etree
17 from lxml.etree import XMLSyntaxError, XSLTApplyError
24 from .elements import WL_ELEMENTS
27 class WLElementLookup(etree.CustomElementClassLookup):
28 def lookup(self, node_type, document, namespace, name):
29 if node_type != 'element':
34 return WL_ELEMENTS[name]
39 parser = etree.XMLParser()
40 parser.set_element_class_lookup(
46 class WLDocument(object):
47 """Legacy class, to be replaced with documents.WLDocument."""
48 LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
51 def __init__(self, edoc, parse_dublincore=True, provider=None,
52 strict=False, meta_fallbacks=None):
54 self.provider = provider
56 root_elem = edoc.getroot()
58 dc_path = './/' + RDFNS('RDF')
60 if root_elem.tag != 'utwor':
61 raise ValidationError(
62 "Invalid root element. Found '%s', should be 'utwor'"
67 self.rdf_elem = root_elem.find(dc_path)
69 if self.rdf_elem is None:
71 "Document must have a '%s' element." % RDFNS('RDF')
74 self.book_info = dcparser.BookInfo.from_element(
75 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
79 def get_statistics(self):
80 def count_text(text, counter, in_fn=False, stanza=False):
82 text = re.sub(r'\s+', ' ', text)
84 chars = len(text) if text.strip() else 0
85 words = len(text.split()) if text.strip() else 0
87 counter['chars_with_fn'] += chars
88 counter['words_with_fn'] += words
90 counter['chars'] += chars
91 counter['words'] += words
93 counter['chars_out_verse_with_fn'] += chars
95 counter['chars_out_verse'] += chars
97 def count(elem, counter, in_fn=False, stanza=False):
98 if elem.tag in (RDFNS('RDF'), 'nota_red', 'abstrakt', 'uwaga', 'ekstra'):
100 if not in_fn and elem.tag in ('pa', 'pe', 'pr', 'pt', 'motyw'):
102 if elem.tag == 'strofa':
104 verses = len(elem.findall('.//br')) + 1
105 counter['verses_with_fn'] += verses
107 counter['verses'] += verses
109 count_text(elem.text, counter, in_fn=in_fn, stanza=stanza)
111 count(child, counter, in_fn=in_fn, stanza=stanza)
112 count_text(child.tail, counter, in_fn=in_fn, stanza=stanza)
123 count(self.edoc.getroot(), data['self'])
124 for k, v in data['self'].items():
127 for part in self.parts(pass_part_errors=True):
128 if isinstance(part, Exception):
129 data['parts'].append((None, {}))
131 data['parts'].append((part, part.get_statistics()))
132 for k, v in data['parts'][-1][1]['total'].items():
133 data['total'][k] = data['total'].get(k, 0) + v
def from_bytes(cls, xml, *args, **kwargs):
    """Build a document from XML supplied as a byte string.

    The bytes are wrapped in an in-memory binary stream and passed to
    ``from_file`` along with any extra positional and keyword arguments.

    :returns: whatever ``cls.from_file`` returns
    """
    stream = six.BytesIO(xml)
    return cls.from_file(stream, *args, **kwargs)
def from_file(cls, xmlfile, *args, **kwargs):
    """Parse an XML document from a path or a readable object.

    :param xmlfile: a text path (opened in binary mode) or any object
        with a ``read()`` method returning bytes or text
    :param args: extra positional arguments for the class constructor
    :param kwargs: extra keyword arguments for the class constructor
    :returns: ``cls(tree, *args, **kwargs)`` for the parsed tree
    """
    # first, prepare for parsing
    if isinstance(xmlfile, six.text_type):
        # The context manager closes the handle even when read() fails.
        with open(xmlfile, 'rb') as source:
            data = source.read()
    else:
        data = xmlfile.read()

    if not isinstance(data, six.text_type):
        data = data.decode('utf-8')

    # Strip every U+FEFF (byte order mark) character from the text.
    data = data.replace(u'\ufeff', '')

    try:
        parser = etree.XMLParser(remove_blank_text=False)
        tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)

        return cls(tree, *args, **kwargs)
    except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
        # NOTE(review): the handler body was not visible in the reviewed
        # excerpt; wrapping the error in ParseError is assumed - confirm.
        raise ParseError(e)
def swap_endlines(self):
    """Converts line breaks in stanzas into <br/> tags.

    Inside every ``strofa`` element, the element text and the tails of
    its children are split on ``LINE_SWAP_EXPR`` (a slash followed by
    whitespace).  Each split point becomes a ``br`` element whose tail
    holds the text that followed it.
    """
    split_lines = self.LINE_SWAP_EXPR.split

    def line_break(following_text):
        # Build a <br/> element carrying the text that comes after it.
        br = etree.Element('br')
        br.tail = following_text
        return br

    # only swap inside stanzas
    for stanza in self.edoc.iter('strofa'):
        # Walk a snapshot of the children: <br/> elements are added below.
        for child in list(stanza):
            if not child.tail:
                continue
            pieces = split_lines(child.tail)
            position = stanza.index(child) + 1
            # Inserting back to front at one fixed position leaves the
            # pieces in document order right after ``child``.
            for piece in reversed(pieces[1:]):
                stanza.insert(position, line_break(piece))
            child.tail = pieces[0]

        if stanza.text:
            pieces = split_lines(stanza.text)
            for piece in reversed(pieces[1:]):
                stanza.insert(0, line_break(piece))
            stanza.text = pieces[0]
def parts(self, pass_part_errors=False):
    """Yield the documents listed as parts in this document's metadata.

    Each entry of ``self.book_info.parts`` is resolved through
    ``self.provider.by_slug`` and loaded with ``self.from_file``.

    :param pass_part_errors: when true, an exception raised while
        loading a part is yielded in place of that part; otherwise it
        propagates to the caller
    :raises NoProvider: if no provider was supplied
    :raises NoDublinCore: if the document has no parsed metadata
    """
    if self.provider is None:
        raise NoProvider('No document provider supplied.')
    if self.book_info is None:
        raise NoDublinCore('No Dublin Core in document.')
    for part_uri in self.book_info.parts:
        # Keep the ``yield`` outside the ``try`` so that only loading
        # failures are handled here, not exceptions thrown into the
        # generator by its consumer.
        try:
            part = self.from_file(
                self.provider.by_slug(part_uri.slug), provider=self.provider
            )
        except Exception as e:
            if not pass_part_errors:
                raise
            part = e
        yield part
204 def chunk(self, path):
205 # convert the path to XPath
206 expr = self.path_to_xpath(path)
207 elems = self.edoc.xpath(expr)
214 def path_to_xpath(self, path):
217 for part in path.split('/'):
218 match = re.match(r'([^\[]+)\[(\d+)\]', part)
222 tag, n = match.groups()
223 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
228 return '/'.join(parts)
def transform(self, stylesheet, **options):
    """Apply ``stylesheet`` to the document tree via its ``xslt`` method.

    :param stylesheet: passed through as the first argument of ``xslt``
    :param options: forwarded unchanged as keyword arguments
    :returns: the result of ``self.edoc.xslt``
    """
    tree = self.edoc
    return tree.xslt(stylesheet, **options)
235 parent = self.rdf_elem.getparent()
236 parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
240 return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
def merge_chunks(self, chunk_dict):
    """Replace nodes of the document with re-parsed chunk contents.

    For every ``path -> content`` pair the path is converted with
    ``path_to_xpath``, the first matching node is looked up, and it is
    replaced by ``content`` parsed inside a fresh element of the same tag.

    :param chunk_dict: mapping of chunk paths to XML content strings
    :returns: list of ``repr((key, xpath, error))`` strings, one for each
        chunk that could not be merged
    """
    unmerged = []

    # ``items()`` instead of the Python 2-only ``iteritems()``.
    for key, data in chunk_dict.items():
        # Reset per chunk so the error report never shows a stale or
        # unbound xpath when the path conversion itself fails.
        xpath = None
        try:
            xpath = self.path_to_xpath(key)
            node = self.edoc.xpath(xpath)[0]
            repl = etree.fromstring(
                "<%s>%s</%s>" % (node.tag, data, node.tag)
            )
            node.getparent().replace(node, repl)
        except Exception as e:
            # Best effort: record the failure and carry on.
            unmerged.append(repr((key, xpath, e)))

    # NOTE(review): the end of this method was not visible in the
    # reviewed excerpt; returning the failure list is assumed - confirm.
    return unmerged
258 def clean_ed_note(self, note_tag='nota_red'):
259 """ deletes forbidden tags from nota_red """
261 for node in self.edoc.xpath('|'.join(
262 '//%s//%s' % (note_tag, tag) for tag in
263 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
def fix_pa_akap(self):
    """Mark ``pa``/``pe``/``pr``/``pt`` elements that contain ``akap``.

    For each ``akap`` found directly under one of those tags, the parent
    gets ``blocks="true"``.  An ``akap`` that is the first child of its
    parent additionally gets ``inline="true"``.
    """
    for parent_tag in ('pa', 'pe', 'pr', 'pt'):
        for paragraph in self.edoc.findall(f'//{parent_tag}/akap'):
            parent = paragraph.getparent()
            parent.set('blocks', 'true')
            if parent.index(paragraph) == 0:
                paragraph.set('inline', 'true')
277 """Returns a set of all editors for book and its children.
279 :returns: set of dcparser.Person objects
281 if self.book_info is None:
282 raise NoDublinCore('No Dublin Core in document.')
283 persons = set(self.book_info.editors
284 + self.book_info.technical_editors)
285 for child in self.parts():
286 persons.update(child.editors())
def as_html(self, *args, **kwargs):
    """Return the result of ``librarian.html.transform`` for this document.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import time.
    from librarian import html as converter
    return converter.transform(self, *args, **kwargs)
def as_text(self, *args, **kwargs):
    """Return the result of ``librarian.text.transform`` for this document.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import time.
    from librarian import text as converter
    return converter.transform(self, *args, **kwargs)
def as_epub(self, *args, **kwargs):
    """Return the result of ``librarian.epub.transform`` for this document.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import time.
    from librarian import epub as converter
    return converter.transform(self, *args, **kwargs)
def as_pdf(self, *args, **kwargs):
    """Return the result of ``librarian.pdf.transform`` for this document.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import time.
    from librarian import pdf as converter
    return converter.transform(self, *args, **kwargs)
def as_mobi(self, *args, **kwargs):
    """Return the result of ``librarian.mobi.transform`` for this document.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import time.
    from librarian import mobi as converter
    return converter.transform(self, *args, **kwargs)
def as_fb2(self, *args, **kwargs):
    """Return the result of ``librarian.fb2.transform`` for this document.

    Extra positional and keyword arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import time.
    from librarian import fb2 as converter
    return converter.transform(self, *args, **kwargs)
def as_cover(self, cover_class=None, *args, **kwargs):
    """Build a cover from the document metadata and return its output file.

    :param cover_class: callable taking ``book_info`` plus the extra
        arguments; ``make_cover`` is used when it is ``None``
    :returns: the result of calling ``output_file()`` on the cover
    """
    factory = make_cover if cover_class is None else cover_class
    cover = factory(self.book_info, *args, **kwargs)
    return cover.output_file()
def latex_dir(self, *args, **kwargs):
    """Call ``librarian.pdf.transform`` with ``latex_dir=True`` forced.

    All other positional and keyword arguments are forwarded unchanged;
    a caller-supplied ``latex_dir`` value is overridden.
    """
    # Imported at call time rather than at module import time.
    from librarian import pdf as converter
    kwargs.update(latex_dir=True)
    return converter.transform(self, *args, **kwargs)
def save_output_file(self, output_file, output_path=None,
                     output_dir_path=None, make_author_dir=False,
                     ext=None):
    """Save ``output_file`` under a path derived from the arguments.

    :param output_file: object exposing ``save_as(path)``
    :param output_path: full target path, used when no
        ``output_dir_path`` is given
    :param output_dir_path: target directory; the file name is then the
        slug from ``self.book_info.url``
    :param make_author_dir: when true, add a subdirectory named after
        ``self.book_info.author`` under ``output_dir_path``
    :param ext: optional extension appended as ``.<ext>``
    """
    if output_dir_path:
        save_path = output_dir_path
        if make_author_dir:
            # Keep the author name as text: on Python 3, os.path.join
            # raises TypeError when bytes and str components are mixed.
            save_path = os.path.join(
                save_path,
                str(self.book_info.author)
            )
        save_path = os.path.join(save_path, self.book_info.url.slug)
        if ext:
            save_path += '.%s' % ext
    else:
        save_path = output_path

    output_file.save_as(save_path)