Fix HTML test.
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian.cover import DefaultEbookCover
9 from librarian import dcparser
10
11 from xml.parsers.expat import ExpatError
12 from lxml import etree
13 from lxml.etree import XMLSyntaxError, XSLTApplyError
14
15 import os
16 import re
17 from StringIO import StringIO
18
class WLDocument(object):
    """Wrapper around an lxml element tree whose root element is ``utwor``.

    The constructor validates the root tag and, unless told otherwise,
    locates the RDF element and parses it into ``book_info`` using
    ``dcparser.BookInfo.from_element``.

    Attributes:
        edoc: the element tree passed to the constructor.
        provider: object with a ``by_uri(uri)`` method, used by ``parts()``
            to open child documents; may be None.
        rdf_elem: the RDF element found under the root, or None when
            Dublin Core parsing was skipped.
        book_info: result of ``dcparser.BookInfo.from_element``, or None
            when Dublin Core parsing was skipped.
    """

    # A slash followed by one whitespace character; see swap_endlines().
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed tree.

        :param edoc: element tree; its root tag must be ``utwor``
        :param parse_dublincore: when true, require and parse the RDF element
        :param provider: stored on the instance for ``parts()``
        :param strict: forwarded to ``BookInfo.from_element``
        :param meta_fallbacks: forwarded to ``BookInfo.from_element`` as
            ``fallbacks``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if parsing was requested but no RDF element
            exists in the document
        """
        self.edoc = edoc
        self.provider = provider
        # Defined unconditionally so the attributes exist even when
        # Dublin Core parsing is skipped.
        self.rdf_elem = None
        self.book_info = None

        root_elem = edoc.getroot()

        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(
                self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from XML text; extra arguments go to ``from_file``."""
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a path or a file-like object.

        :param xmlfile: a path (string) or any object with a ``read()`` method
        :param args: forwarded to the constructor
        :param kwargs: forwarded to the constructor
        :raises ParseError: wrapping ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # Drop byte-order-mark characters wherever they occur in the text.
        data = data.replace(u'\ufeff', u'')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    @staticmethod
    def _insert_breaks(parent, index, chunks):
        """Insert one ``<br/>`` per chunk after the first; return the first.

        Chunks are consumed from the end and each new element is inserted
        at the same ``index``, so the elements end up in text order.
        """
        while len(chunks) > 1:
            ins = etree.Element('br')
            ins.tail = chunks.pop()
            parent.insert(index, ins)
        return chunks.pop(0)

    def swap_endlines(self):
        """Replace each slash-plus-whitespace marker in stanzas with <br/>.

        Only text directly inside ``strofa`` elements is processed: the
        element's own text and the tail text of each of its children.
        """
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a copy: new <br/> children are inserted meanwhile.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    child.tail = self._insert_breaks(elem, ins_index, chunks)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                elem.text = self._insert_breaks(elem, 0, chunks)

    def parts(self):
        """Yield a document for every URI listed in ``book_info.parts``.

        Each part is opened through ``provider.by_uri`` and shares this
        document's provider.

        :raises NoProvider: if no provider is set
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                                 provider=self.provider)

    def chunk(self, path):
        """Return the first node matching ``path``, or None if none match.

        ``path`` uses the notation accepted by ``path_to_xpath``.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if not elems:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A segment of the form ``tag[n]`` becomes
        ``*[n+1][name() = 'tag']`` (the index is shifted by one because
        XPath positions start at 1); any other segment is copied as is.
        A leading ``.`` segment is replaced by an empty one, so ``./a``
        becomes ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via ``edoc.xslt`` and return the result."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one regenerated from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Refresh the Dublin Core element and return the tree as pretty-printed unicode."""
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes addressed by path with freshly parsed content.

        :param chunk_dict: mapping of path (see ``path_to_xpath``) to the
            inner XML that should become the new content of that node
        :returns: list of ``repr((key, xpath, exception))`` strings, one
            for every entry that could not be merged; ``xpath`` is None
            when the path itself could not be translated
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset per entry so the error report never reads an unbound
            # name or a value left over from the previous entry.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort: record the failure and carry on with the rest.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """Neutralise forbidden tags found anywhere inside ``nota_red``.

        Each matching node is emptied and renamed to ``span``; its tail
        text is preserved.
        """
        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        expr = '|'.join('//nota_red//%s' % tag for tag in forbidden)

        for node in self.edoc.xpath(expr):
            # clear() also wipes the tail, so save and restore it.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors +
                      self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters
    # Each converter imports its module lazily and delegates to that
    # module's ``transform`` function.

    def as_html(self, *args, **kwargs):
        """Delegate to ``librarian.html.transform``."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Delegate to ``librarian.text.transform``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Delegate to ``librarian.epub.transform``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Delegate to ``librarian.pdf.transform``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Delegate to ``librarian.mobi.transform``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Delegate to ``librarian.fb2.transform``."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Build a cover from ``book_info`` and return its ``output_file()``.

        :param cover_class: class to instantiate; ``DefaultEbookCover``
            when None
        """
        if cover_class is None:
            cover_class = DefaultEbookCover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    def save_output_file(self, output_file, output_path=None,
                         output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under an explicit or a derived path.

        When ``output_dir_path`` is given the target is
        ``output_dir_path[/author]/slug[.ext]``, with author and slug
        taken from ``book_info``; otherwise ``output_path`` is used as is.

        :param output_file: object with a ``save_as(path)`` method
        :param output_path: full target path, used only when
            ``output_dir_path`` is not given
        :param output_dir_path: directory in which to place the file
        :param make_author_dir: add a sub-directory named after the author
        :param ext: extension appended after a dot, if given
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(
                    save_path,
                    unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                     self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)