Python 3.4-3.7 support;
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
9 from librarian import RDFNS
10 from librarian.cover import make_cover
11 from librarian import dcparser
12
13 from xml.parsers.expat import ExpatError
14 from lxml import etree
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
16
17 import os
18 import re
19 import six
20
21
class WLDocument(object):
    """Wrapper around a parsed XML document whose root element is ``utwor``.

    Unless disabled, the Dublin Core metadata found in the document's RDF
    element is parsed into ``book_info``.  The wrapped tree itself is kept
    in ``edoc``.
    """

    # A slash followed by a whitespace character is the line-break marker
    # that swap_endlines() splits on.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    # One path segment of the form ``tag[n]``, as handled by path_to_xpath().
    _INDEXED_PART_EXPR = re.compile(r'([^\[]+)\[(\d+)\]')
    # Tags which clean_ed_note() neutralizes inside an editorial note.
    _FORBIDDEN_NOTE_TAGS = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed element tree.

        :param edoc: parsed tree; must provide ``getroot()``
        :param parse_dublincore: when true, locate the RDF element and build
            ``book_info`` from it; otherwise ``book_info`` is None
        :param provider: document provider, used by parts() to fetch child
            documents by URI
        :param strict: passed through to dcparser.BookInfo.from_element
        :param meta_fallbacks: passed to it as ``fallbacks``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if Dublin Core parsing was requested but the
            document has no RDF element
        """
        self.edoc = edoc
        self.provider = provider
        # Always defined, even when Dublin Core parsing is skipped.
        self.rdf_elem = None
        self.book_info = None

        root_elem = edoc.getroot()
        if root_elem.tag != 'utwor':
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'"
                % root_elem.tag)

        if not parse_dublincore:
            return

        self.rdf_elem = root_elem.find('.//' + RDFNS('RDF'))
        if self.rdf_elem is None:
            raise NoDublinCore(
                'Document has no DublinCore - which is required.')

        self.book_info = dcparser.BookInfo.from_element(
            self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build a document from a byte string holding the XML source.

        Extra arguments are forwarded to from_file().
        """
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file path or a file-like object.

        :param xmlfile: either a text string naming a file to open, or an
            object with a ``read()`` method returning bytes or text
        :raises ParseError: wrapping ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing the document

        Remaining arguments are forwarded to the constructor.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, six.text_type):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, six.text_type):
            data = data.decode('utf-8')

        # Drop every byte order mark character before handing the text on.
        data = data.replace(u'\ufeff', '')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        split = self.LINE_SWAP_EXPR.split

        # only swap inside stanzas
        for stanza in self.edoc.iter('strofa'):
            # Iterate over a snapshot: <br/> elements are inserted below.
            for child in list(stanza):
                if child.tail:
                    pieces = split(child.tail)
                    insert_at = stanza.index(child) + 1
                    # Inserting the last piece first at a fixed index leaves
                    # the pieces in their original order.
                    for piece in reversed(pieces[1:]):
                        br = etree.Element('br')
                        br.tail = piece
                        stanza.insert(insert_at, br)
                    child.tail = pieces[0]
            if stanza.text:
                pieces = split(stanza.text)
                for piece in reversed(pieces[1:]):
                    br = etree.Element('br')
                    br.tail = piece
                    stanza.insert(0, br)
                stanza.text = pieces[0]

    def parts(self):
        """Yield a document for every part URI listed in ``book_info.parts``.

        Each part is fetched with ``provider.by_uri()`` and loaded through
        from_file() with the same provider.  This is a generator, so the
        exceptions below are raised on first iteration.

        :raises NoProvider: if no provider was supplied
        :raises NoDublinCore: if the document has no parsed Dublin Core
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(
                self.provider.by_uri(part_uri), provider=self.provider)

    def chunk(self, path):
        """Return the first node matching ``path``, or None if none match.

        ``path`` is converted with path_to_xpath() before evaluation.
        """
        # convert the path to XPath
        elems = self.edoc.xpath(self.path_to_xpath(path))
        if not elems:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A segment of the form ``tag[n]`` becomes
        ``*[n+1][name() = 'tag']`` (the index is shifted by one); any other
        segment is copied unchanged.  A leading ``.`` segment is replaced by
        an empty one, so ``./a`` yields ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = self._INDEXED_PART_EXPR.match(part)
            if match is None:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Return the result of ``edoc.xslt(stylesheet, **options)``."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one rebuilt from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            new_rdf = self.book_info.to_etree(parent)
            parent.replace(self.rdf_elem, new_rdf)
            # Track the element that is now in the tree; the old one was
            # detached by replace(), so a later call could not find its
            # parent any more.
            self.rdf_elem = new_rdf

    def serialize(self):
        """Return the document as pretty-printed text.

        The Dublin Core element is refreshed with update_dc() first.
        """
        self.update_dc()
        return etree.tostring(
            self.edoc, encoding='unicode', pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace the content of the nodes addressed by ``chunk_dict``.

        :param chunk_dict: mapping of path (see path_to_xpath()) to the new
            inner markup of the first node matching that path
        :returns: list with one ``repr((key, xpath, error))`` string for
            every chunk that could not be merged; ``xpath`` is None if the
            path itself could not be converted
        """
        unmerged = []

        # .items() works on both Python 2 and 3, unlike .iteritems().
        for key, data in chunk_dict.items():
            # Reset per key so a failure never reports a stale expression.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                    u"<%s>%s</%s>" % (node.tag, data, node.tag))
                # replace() detaches the old node together with its tail
                # text, so carry the tail over to keep what follows it.
                repl.tail = node.tail
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort: report the failure and go on with the rest.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self, note_tag='nota_red'):
        """Neutralize forbidden tags found inside ``note_tag`` elements.

        Every matching descendant is emptied (text, children and attributes
        are cleared) and renamed to ``span``; its tail text is kept.
        """
        expr = '|'.join(
            '//%s//%s' % (note_tag, tag)
            for tag in self._FORBIDDEN_NOTE_TAGS)

        for node in self.edoc.xpath(expr):
            # clear() also wipes the tail, so save and restore it.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no parsed Dublin Core
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(
            self.book_info.editors + self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters
    #
    # Each converter imports its backend at call time rather than at module
    # import (presumably to avoid import cycles - confirm before moving).

    def as_html(self, *args, **kwargs):
        """Return ``librarian.html.transform(self, ...)``."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return ``librarian.text.transform(self, ...)``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return ``librarian.epub.transform(self, ...)``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return ``librarian.pdf.transform(self, ...)``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return ``librarian.mobi.transform(self, ...)``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return ``librarian.fb2.transform(self, ...)``."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Return the output file of a cover built from ``book_info``.

        :param cover_class: callable taking ``book_info`` plus the remaining
            arguments; defaults to ``make_cover``
        """
        if cover_class is None:
            cover_class = make_cover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    # for debugging only
    def latex_dir(self, *args, **kwargs):
        """Call the PDF transform with ``latex_dir=True`` forced on."""
        kwargs['latex_dir'] = True
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
                         output_dir_path=None, make_author_dir=False,
                         ext=None):
        """Save ``output_file`` under a given or a generated path.

        :param output_file: object providing ``save_as(path)``
        :param output_path: exact target path; used only when
            ``output_dir_path`` is not given
        :param output_dir_path: target directory; the file name is then
            built from ``book_info.url.slug``
        :param make_author_dir: with ``output_dir_path``, put the file in a
            subdirectory named after ``book_info.author``
        :param ext: with ``output_dir_path``, extension appended after a dot
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                author_dir = six.text_type(self.book_info.author)
                if six.PY2:
                    # Keep the previous byte-string path on Python 2.  On
                    # Python 3 os.path.join() raises TypeError when str and
                    # bytes components are mixed, so stay with text there.
                    author_dir = author_dir.encode('utf-8')
                save_path = os.path.join(save_path, author_dir)
            save_path = os.path.join(save_path, self.book_info.url.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)