Separate the general from the WL-specific: PDF
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS, IOFile
8 from librarian.styles.wolnelektury.cover import WLCover
9 from librarian import dcparser
10
11 from xml.parsers.expat import ExpatError
12 from lxml import etree
13 from lxml.etree import XMLSyntaxError, XSLTApplyError
14
15 import os
16 import re
17 from StringIO import StringIO
18
class WLDocument(object):
    """An XML document whose root element must be ``utwor``.

    The document is read from an IOFile-like ``source`` (anything exposing
    ``get_string()``), parsed lazily with lxml, and can optionally expose
    its Dublin Core metadata as ``book_info``.
    """

    # A slash followed by a single whitespace character; swap_endlines()
    # splits stanza text on this pattern.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    # Cache for the parsed tree; filled on first access to ``edoc``.
    _edoc = None

    @property
    def edoc(self):
        """Return the parsed lxml tree, parsing the source on first access.

        Byte input is decoded as UTF-8 and every U+FEFF character is
        removed before parsing.

        :raises ParseError: when the source cannot be parsed
        """
        if self._edoc is None:
            data = self.source.get_string()
            if not isinstance(data, unicode):
                data = data.decode('utf-8')
            data = data.replace(u'\ufeff', u'')
            try:
                parser = etree.XMLParser(remove_blank_text=False)
                self._edoc = etree.parse(
                    StringIO(data.encode('utf-8')), parser)
            except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
                raise ParseError(e)
        return self._edoc

    # Cache for the RDF element; filled on first access to ``rdf_elem``.
    _rdf_elem = None

    @property
    def rdf_elem(self):
        """Return the first ``RDF`` element found below the root.

        :raises NoDublinCore: when the document has no such element
        """
        if self._rdf_elem is None:
            dc_path = './/' + RDFNS('RDF')
            self._rdf_elem = self.edoc.getroot().find(dc_path)
            if self._rdf_elem is None:
                raise NoDublinCore(
                    'Document has no DublinCore - which is required.')
        return self._rdf_elem

    # Cache for the parsed metadata; filled on first access to ``book_info``.
    _book_info = None

    @property
    def book_info(self):
        """Return metadata built from ``rdf_elem``, or None.

        None is returned when the document was created with
        ``parse_dublincore`` set to a false value.
        """
        if not self.parse_dublincore:
            return None
        if self._book_info is None:
            self._book_info = dcparser.BookInfo.from_element(
                self.rdf_elem, fallbacks=self.meta_fallbacks,
                strict=self.strict)
        return self._book_info

    def __init__(self, iofile, provider=None,
            parse_dublincore=True, # shouldn't it be in a subclass?
            strict=False, # ?
            meta_fallbacks=None # ?
            ):
        """Wrap ``iofile`` and validate the root element.

        :param iofile: source object exposing ``get_string()``
        :param provider: object exposing ``by_uri()``, used by parts()
        :param parse_dublincore: when true, metadata is parsed right away
        :param strict: passed through to ``BookInfo.from_element``
        :param meta_fallbacks: passed through as ``fallbacks``
        :raises ValidationError: when the root element is not ``utwor``
        """
        self.source = iofile
        self.provider = provider
        self.parse_dublincore = parse_dublincore
        self.strict = strict
        self.meta_fallbacks = meta_fallbacks
        # Bind the root once: the error message below needs it, and the
        # original code referenced an undefined name here.
        root_elem = self.edoc.getroot()
        if root_elem.tag != 'utwor':
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'"
                % root_elem.tag)
        if parse_dublincore:
            # Touch the property so metadata errors surface at
            # construction time rather than on first use.
            self.book_info

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from XML text; extra arguments go to __init__."""
        return cls(IOFile.from_string(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file name or a file object.

        A string argument is treated as a file name; anything else is
        handed to ``IOFile.from_file``. Extra arguments go to __init__.
        """
        if isinstance(xmlfile, basestring):
            iofile = IOFile.from_filename(xmlfile)
        else:
            iofile = IOFile.from_file(xmlfile)
        return cls(iofile, *args, **kwargs)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: <br/> elements are inserted into
            # ``elem`` while walking its children.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Popping from the end while inserting at a fixed
                    # index keeps the chunks in their original order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every part listed in ``book_info.parts``.

        This is a generator, so the checks below run when iteration
        starts, not when parts() is called.

        :raises NoDublinCore: when the document has no metadata
        :raises NoProvider: when parts exist but no provider was given
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        if self.book_info.parts and self.provider is None:
            raise NoProvider('No document provider supplied.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                    provider=self.provider)

    def chunk(self, path):
        """Return the first node matching ``path``, or None if none match."""
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if len(elems) == 0:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A part of the form ``name[n]`` becomes ``*[n+1][name() = 'name']``;
        any other part is copied unchanged. A leading ``.`` part is
        replaced by an empty string, so ``./a`` turns into ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via its ``xslt`` method."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one rebuilt from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Refresh the metadata and return the tree as pretty-printed text."""
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes addressed by path with new content.

        For every ``path -> data`` pair the first matching node is
        replaced by an element parsed from ``data`` wrapped in the node's
        own tag name.

        :param chunk_dict: mapping of paths to XML content strings
        :returns: list of ``repr((key, xpath, exception))`` strings, one
            for each pair that could not be merged
        """
        unmerged = []

        for key, data in chunk_dict.iteritems():
            # Reset per iteration: if path_to_xpath() itself fails, the
            # handler below must not hit an unbound name or report the
            # expression left over from the previous key.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                    u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Deliberately broad: failures are reported, not raised.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """ deletes forbidden tags from nota_red """

        for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
            # The tail is saved and restored around clear(); the node
            # itself is emptied and renamed to 'span'.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors +
                        self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters

    def as_html(self, *args, **kwargs):
        """Return ``librarian.html.transform`` applied to this document."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return ``librarian.text.transform`` applied to this document."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return ``librarian.epub.transform`` applied to this document."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return ``librarian.pdf.transform`` applied to this document."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return ``librarian.mobi.transform`` applied to this document."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return ``librarian.fb2.transform`` applied to this document."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Return the output file of a cover built from ``book_info``.

        :param cover_class: cover class to instantiate; WLCover when None
        """
        if cover_class is None:
            cover_class = WLCover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    def save_output_file(self, output_file, output_path=None,
            output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under an explicit or a derived path.

        When ``output_dir_path`` is given, the target is that directory,
        optionally followed by an author subdirectory, then the slug of
        the book's URI, plus ``.ext`` when ``ext`` is given. Otherwise
        ``output_path`` is used as is.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path,
                        unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)