1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import unicode_literals
8 from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
9 from librarian import RDFNS
10 from librarian.cover import make_cover
11 from librarian import dcparser
13 from xml.parsers.expat import ExpatError
14 from lxml import etree
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
22 class WLDocument(object):
23 LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
# Initialise the document wrapper from an already-parsed XML tree (edoc).
#
# Steps visible here: remember the provider, require the root element's tag
# to be 'utwor', locate the RDF element below the root and build
# self.book_info from it with dcparser.BookInfo.from_element.
#
# Raises ValidationError when the root tag is not 'utwor' and NoDublinCore
# when no RDF element is found.
#
# NOTE(review): the embedded line numbers skip 28, 30, 32, 34, 37, 38, 40
# and 43, so some statements appear to be missing from this listing. In the
# lines shown, parse_dublincore is never referenced and self.edoc is never
# assigned although other methods read it -- confirm against the full file.
26 def __init__(self, edoc, parse_dublincore=True, provider=None,
27 strict=False, meta_fallbacks=None):
29 self.provider = provider
31 root_elem = edoc.getroot()
# Search path for the RDF element anywhere below the root.
33 dc_path = './/' + RDFNS('RDF')
35 if root_elem.tag != 'utwor':
36 raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
39 self.rdf_elem = root_elem.find(dc_path)
41 if self.rdf_elem is None:
42 raise NoDublinCore('Document has no DublinCore - which is required.')
# meta_fallbacks and strict are forwarded unchanged to the metadata parser.
44 self.book_info = dcparser.BookInfo.from_element(
45 self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
def from_bytes(cls, xml, *args, **kwargs):
    """Build a document from XML held in a byte string.

    The bytes are wrapped in an in-memory binary stream and handed to
    ``from_file``; any extra positional and keyword arguments are passed
    through unchanged.
    """
    stream = six.BytesIO(xml)
    return cls.from_file(stream, *args, **kwargs)
# Parse XML taken from xmlfile and return cls(tree, *args, **kwargs).
#
# Visible behaviour: when xmlfile is a text string it is used as a path and
# opened in binary mode; the content (data) is decoded as UTF-8 unless it is
# already text, U+FEFF (byte order mark) characters are removed, and the
# result is re-encoded to UTF-8 and parsed with lxml.
#
# NOTE(review): the embedded line numbers skip 59-65, 70-71 and 74, so where
# data is read, where the opened file is closed, the matching try: and the
# body of the except clause are not visible here -- confirm against the
# full file.
54 def from_file(cls, xmlfile, *args, **kwargs):
56 # first, prepare for parsing
57 if isinstance(xmlfile, six.text_type):
58 file = open(xmlfile, 'rb')
66 if not isinstance(data, six.text_type):
67 data = data.decode('utf-8')
# Drop every U+FEFF character before parsing.
69 data = data.replace(u'\ufeff', '')
# Blank text nodes are kept (remove_blank_text=False).
72 parser = etree.XMLParser(remove_blank_text=False)
73 tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
75 return cls(tree, *args, **kwargs)
76 except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
# Replace line-break markers inside every 'strofa' element with 'br' elements.
# The marker is whatever the class-level LINE_SWAP_EXPR matches: a '/'
# followed by one whitespace character.
#
# NOTE(review): the embedded line numbers skip 84, 92 and 97. In the lines
# shown there is no guard for a child.tail or elem.text that is None, and the
# 'br' elements created for elem.text are never inserted -- presumably both
# are handled on the omitted lines; confirm against the full file.
79 def swap_endlines(self):
80 """Converts line breaks in stanzas into <br/> tags."""
81 # only swap inside stanzas
82 for elem in self.edoc.iter('strofa'):
# Iterate over a snapshot: 'br' siblings are inserted into elem below.
83 for child in list(elem):
85 chunks = self.LINE_SWAP_EXPR.split(child.tail)
86 ins_index = elem.index(child) + 1
# Pieces are popped from the end and inserted at the same fixed index, so
# the resulting 'br' elements end up in the original text order.
87 while len(chunks) > 1:
88 ins = etree.Element('br')
89 ins.tail = chunks.pop()
90 elem.insert(ins_index, ins)
# The first piece stays as the tail of the existing child.
91 child.tail = chunks.pop(0)
# Same splitting for the text that precedes the first child.
93 chunks = self.LINE_SWAP_EXPR.split(elem.text)
94 while len(chunks) > 1:
95 ins = etree.Element('br')
96 ins.tail = chunks.pop()
98 elem.text = chunks.pop(0)
101 if self.provider is None:
102 raise NoProvider('No document provider supplied.')
103 if self.book_info is None:
104 raise NoDublinCore('No Dublin Core in document.')
105 for part_uri in self.book_info.parts:
106 yield self.from_file(self.provider.by_uri(part_uri), provider=self.provider)
# Look up elements of the document addressed by a slash-separated path.
# The path is converted with path_to_xpath and evaluated on self.edoc.
# NOTE(review): the lines following 111 are not visible here, so what is
# done with elems (and what is returned) cannot be stated -- confirm against
# the full file.
108 def chunk(self, path):
109 # convert the path to XPath
110 expr = self.path_to_xpath(path)
111 elems = self.edoc.xpath(expr)
# Convert a slash-separated path into an XPath expression string.
# A part of the form name[N] becomes "*[N+1][name() = 'name']"; the converted
# parts are joined with '/'.
# NOTE(review): the +1 suggests the incoming index is zero-based (XPath
# positions start at 1) -- confirm with callers.
# NOTE(review): the embedded line numbers skip 119-120, 123-125 and 128-131,
# so the creation of parts and the handling of parts that do not match the
# pattern are not visible here.
118 def path_to_xpath(self, path):
121 for part in path.split('/'):
# Groups: element name, then the digits inside the square brackets.
122 match = re.match(r'([^\[]+)\[(\d+)\]', part)
126 tag, n = match.groups()
127 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))
132 return '/'.join(parts)
def transform(self, stylesheet, **options):
    """Apply an XSLT stylesheet to the document tree.

    Delegates to ``self.edoc.xslt`` and returns whatever it returns;
    ``options`` are forwarded unchanged as keyword arguments.
    """
    tree = self.edoc
    return tree.xslt(stylesheet, **options)
139 parent = self.rdf_elem.getparent()
140 parent.replace(self.rdf_elem, self.book_info.to_etree(parent))
144 return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)
# Replace nodes of the document with new content taken from chunk_dict.
# Each key is a path understood by path_to_xpath; each value is markup that
# is wrapped in an element carrying the tag of the node it replaces.
# Failures are recorded in unmerged as repr((key, xpath, exception)).
#
# NOTE(review): dict.iteritems() exists only on Python 2; on Python 3 this
# call raises AttributeError and needs items() or six.iteritems().
# NOTE(review): the embedded line numbers skip 147-148, 150 and 157-159, so
# the creation of unmerged, the try: line and the return value are not
# visible here.
146 def merge_chunks(self, chunk_dict):
149 for key, data in chunk_dict.iteritems():
151 xpath = self.path_to_xpath(key)
# Only the first node matched by the expression is replaced.
152 node = self.edoc.xpath(xpath)[0]
153 repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
154 node.getparent().replace(node, repl)
155 except Exception as e:
156 unmerged.append(repr((key, xpath, e)))
# Select the forbidden elements found inside editorial notes.
# The XPath is a union of '//<note_tag>//<tag>' for each listed tag, i.e.
# those tags at any depth below any note_tag element.
# NOTE(review): the loop body follows on lines not visible here, so how the
# selected nodes are removed cannot be stated -- confirm against the full
# file.
160 def clean_ed_note(self, note_tag='nota_red'):
161 """ deletes forbidden tags from nota_red """
163 for node in self.edoc.xpath('|'.join('//%s//%s' % (note_tag, tag) for tag in
164 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
171 """Returns a set of all editors for book and its children.
173 :returns: set of dcparser.Person objects
175 if self.book_info is None:
176 raise NoDublinCore('No Dublin Core in document.')
177 persons = set(self.book_info.editors + self.book_info.technical_editors)
178 for child in self.parts():
179 persons.update(child.editors())
def as_html(self, *args, **kwargs):
    """Render this document with ``librarian.html.transform``.

    All extra arguments are forwarded to the converter unchanged; the
    converter module is imported only when the method is called.
    """
    from librarian import html as converter
    return converter.transform(self, *args, **kwargs)
def as_text(self, *args, **kwargs):
    """Render this document with ``librarian.text.transform``.

    All extra arguments are forwarded to the converter unchanged; the
    converter module is imported only when the method is called.
    """
    from librarian import text as converter
    return converter.transform(self, *args, **kwargs)
def as_epub(self, *args, **kwargs):
    """Render this document with ``librarian.epub.transform``.

    All extra arguments are forwarded to the converter unchanged; the
    converter module is imported only when the method is called.
    """
    from librarian import epub as converter
    return converter.transform(self, *args, **kwargs)
def as_pdf(self, *args, **kwargs):
    """Render this document with ``librarian.pdf.transform``.

    All extra arguments are forwarded to the converter unchanged; the
    converter module is imported only when the method is called.
    """
    from librarian import pdf as converter
    return converter.transform(self, *args, **kwargs)
def as_mobi(self, *args, **kwargs):
    """Render this document with ``librarian.mobi.transform``.

    All extra arguments are forwarded to the converter unchanged; the
    converter module is imported only when the method is called.
    """
    from librarian import mobi as converter
    return converter.transform(self, *args, **kwargs)
def as_fb2(self, *args, **kwargs):
    """Render this document with ``librarian.fb2.transform``.

    All extra arguments are forwarded to the converter unchanged; the
    converter module is imported only when the method is called.
    """
    from librarian import fb2 as converter
    return converter.transform(self, *args, **kwargs)
def as_cover(self, cover_class=None, *args, **kwargs):
    """Build a cover for this document and return its output file.

    ``cover_class`` is called with ``self.book_info`` followed by the
    extra arguments; when it is ``None`` the module-level ``make_cover``
    is used instead. The result of ``output_file()`` on the created
    object is returned.
    """
    factory = make_cover if cover_class is None else cover_class
    cover = factory(self.book_info, *args, **kwargs)
    return cover.output_file()
def latex_dir(self, *args, **kwargs):
    """Call ``librarian.pdf.transform`` with ``latex_dir=True`` forced.

    Any ``latex_dir`` value supplied by the caller is overridden; all other
    arguments are forwarded unchanged.
    """
    from librarian import pdf as converter
    forwarded = dict(kwargs, latex_dir=True)
    return converter.transform(self, *args, **forwarded)
# Save output_file under a computed path by calling output_file.save_as().
# In the lines shown the path is either built from output_dir_path (plus the
# author, plus the slug of self.book_info.url, plus '.<ext>') or taken
# directly from output_path.
#
# NOTE(review): the embedded line numbers skip 222, 224, 227, 229 and 231,
# so the conditions selecting each step (presumably output_dir_path,
# make_author_dir and ext) are not visible here -- confirm against the full
# file.
# NOTE(review): the author component is encoded to UTF-8 bytes before being
# joined; on Python 3 os.path.join rejects mixed text and bytes components,
# so this fails if output_dir_path is a text string -- verify.
221 def save_output_file(self, output_file, output_path=None, output_dir_path=None, make_author_dir=False, ext=None):
223 save_path = output_dir_path
225 save_path = os.path.join(save_path, six.text_type(self.book_info.author).encode('utf-8'))
226 save_path = os.path.join(save_path, self.book_info.url.slug)
228 save_path += '.%s' % ext
230 save_path = output_path
232 output_file.save_as(save_path)