2bb9509911a01cbbf6377fe27ee6d37bab90b38f
[librarian.git] / src / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from __future__ import unicode_literals
7
8 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
9 from librarian import RDFNS
10 from librarian.cover import make_cover
11 from librarian import dcparser
12
13 from xml.parsers.expat import ExpatError
14 from lxml import etree
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
16
17 import os
18 import re
19 import six
20
21
class WLDocument(object):
    """A parsed XML document whose root element is ``utwor``.

    Wraps the element tree in ``edoc`` and, optionally, the metadata
    parsed from the document's RDF element in ``book_info``.
    """

    # Matches a slash followed by one whitespace character; used by
    # swap_endlines() to find the places where a <br/> is inserted.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed tree and optionally read its metadata.

        :param edoc: parsed tree; must provide ``getroot()``
        :param parse_dublincore: when true, locate the RDF element and
            build ``book_info`` from it; otherwise ``book_info`` is None
        :param provider: object used by parts() to fetch child documents
            through its ``by_uri()`` method
        :param strict: passed to ``dcparser.BookInfo.from_element``
        :param meta_fallbacks: passed to
            ``dcparser.BookInfo.from_element`` as ``fallbacks``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if metadata was requested but the document
            has no RDF element
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()

        if root_elem.tag != 'utwor':
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'"
                % root_elem.tag
            )

        if not parse_dublincore:
            self.book_info = None
            return

        rdf_tag = RDFNS('RDF')
        self.rdf_elem = root_elem.find('.//' + rdf_tag)

        if self.rdf_elem is None:
            raise NoDublinCore(
                "Document must have a '%s' element." % rdf_tag
            )

        self.book_info = dcparser.BookInfo.from_element(
            self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)

    @classmethod
    def from_bytes(cls, xml, *args, **kwargs):
        """Build a document from a byte string holding the XML source.

        Extra arguments are forwarded to from_file().
        """
        return cls.from_file(six.BytesIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a path or a readable object.

        :param xmlfile: a text string is treated as a path and opened in
            binary mode; anything else must provide ``read()``
        :raises ParseError: wrapping an ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing

        Extra arguments are forwarded to the class constructor.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, six.text_type):
            # The context manager closes the file even if read() fails.
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, six.text_type):
            data = data.decode('utf-8')

        # Drop every U+FEFF (byte order mark) character before parsing.
        data = data.replace(u'\ufeff', '')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags.

        Works in place on the text of every ``strofa`` element and on
        the tails of its direct children: each match of LINE_SWAP_EXPR
        is replaced by a newly inserted ``br`` element.
        """
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a copy, because new children are inserted
            # into elem inside the loop.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end and always inserted
                    # at the same index, so they end up in their
                    # original order right after the child.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                # Same back-to-front insertion, at the very beginning
                # of the element.
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI listed in ``book_info.parts``.

        Each part is fetched with ``provider.by_uri()`` and loaded with
        from_file(), using the same provider.

        :raises NoProvider: if the document has no provider
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(
                self.provider.by_uri(part_uri), provider=self.provider
            )

    def chunk(self, path):
        """Return the first node matching ``path``, or None if none does.

        :param path: path in the notation accepted by path_to_xpath()
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)
        return elems[0] if len(elems) else None

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A segment of the form ``tag[n]`` becomes
        ``*[n+1][name() = 'tag']`` (the index is shifted by one); other
        segments are copied unchanged. A leading ``.`` segment is
        replaced with an empty one, so ``./a[0]`` yields
        ``/*[1][name() = 'a']``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if match is None:
                parts.append(part)
                continue
            tag, n = match.groups()
            parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree through ``edoc.xslt()``.

        Keyword options are forwarded to ``edoc.xslt()`` unchanged.
        """
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one rebuilt from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Return the document as pretty-printed text.

        The RDF element is refreshed with update_dc() first.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding='unicode', pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes of the tree with new content, chunk by chunk.

        :param chunk_dict: mapping of a path (see path_to_xpath()) to
            the new inner markup of the node found at that path
        :returns: list with one ``repr((key, xpath, exception))`` string
            for every chunk that could not be merged

        A failing chunk never aborts the merge; it is only reported in
        the returned list.
        """
        unmerged = []

        # .items() rather than .iteritems(): the latter does not exist
        # on Python 3 dictionaries.
        for key, data in chunk_dict.items():
            # Reset for every chunk, so that the error report below never
            # hits an unbound name or a value left by a previous chunk.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                    "<%s>%s</%s>" % (node.tag, data, node.tag)
                )
                node.getparent().replace(node, repl)
            except Exception as e:
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self, note_tag='nota_red'):
        """Neutralize forbidden tags found inside ``note_tag`` elements.

        Every matching descendant is cleared (children, text and
        attributes are dropped) and renamed to ``span``; only its tail
        text is kept.
        """
        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        expr = '|'.join(
            '//%s//%s' % (note_tag, tag) for tag in forbidden
        )

        for node in self.edoc.xpath(expr):
            # clear() also removes the tail, so it is saved and restored.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors
                      + self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters

    def as_html(self, *args, **kwargs):
        """Return the result of ``librarian.html.transform`` for this
        document; arguments are forwarded."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return the result of ``librarian.text.transform`` for this
        document; arguments are forwarded."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return the result of ``librarian.epub.transform`` for this
        document; arguments are forwarded."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return the result of ``librarian.pdf.transform`` for this
        document; arguments are forwarded."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return the result of ``librarian.mobi.transform`` for this
        document; arguments are forwarded."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return the result of ``librarian.fb2.transform`` for this
        document; arguments are forwarded."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Build a cover from ``book_info`` and return its output file.

        :param cover_class: callable invoked with ``book_info`` and the
            remaining arguments; defaults to ``make_cover``
        :returns: result of calling ``output_file()`` on the cover
        """
        if cover_class is None:
            cover_class = make_cover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    # for debugging only
    def latex_dir(self, *args, **kwargs):
        """Call ``librarian.pdf.transform`` with ``latex_dir=True``.

        Other arguments are forwarded unchanged.
        """
        kwargs['latex_dir'] = True
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
                         output_dir_path=None, make_author_dir=False,
                         ext=None):
        """Save ``output_file`` under an explicit or a derived path.

        :param output_file: object providing ``save_as(path)``
        :param output_path: target path, used only when
            ``output_dir_path`` is not given
        :param output_dir_path: directory in which the file is saved
            under the name ``book_info.url.slug``
        :param make_author_dir: when true, put the file in a
            subdirectory of ``output_dir_path`` named after
            ``book_info.author``
        :param ext: extension appended (after a dot) to the derived
            path; ignored when ``output_path`` is used
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                author_dir = six.text_type(self.book_info.author)
                if six.PY2:
                    # Keep the historical Python 2 behaviour of using
                    # UTF-8 bytes. On Python 3 the name must stay text:
                    # os.path.join() raises TypeError when str and bytes
                    # components are mixed.
                    author_dir = author_dir.encode('utf-8')
                save_path = os.path.join(save_path, author_dir)
            save_path = os.path.join(save_path, self.book_info.url.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)