9300aa6c43ac04bccd3668e32a710a239b24db1c
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS, IOFile
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
class WLDocument(object):
    """An XML document with an ``utwor`` root and optional Dublin Core data.

    The document is read from a source object exposing ``get_string()``
    and parsed lazily with lxml on first access to :attr:`edoc`.  The
    constructor validates the root element and, unless disabled, parses
    the RDF metadata into :attr:`book_info`.
    """

    # Matches a slash followed by a single whitespace character; used by
    # swap_endlines() to split stanza text into separate verse lines.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    # Cache for the parsed tree; filled in on first access to ``edoc``.
    _edoc = None

    @property
    def edoc(self):
        """Return the parsed element tree, parsing the source on first use.

        The source data is decoded as UTF-8 when it is not already unicode,
        every U+FEFF character is removed, and the result is parsed with
        blank text preserved.  The tree is cached on the instance.

        Raises:
            ParseError: wrapping the underlying parser exception.
        """
        if self._edoc is None:
            data = self.source.get_string()
            if not isinstance(data, unicode):
                data = data.decode('utf-8')
            data = data.replace(u'\ufeff', '')
            try:
                parser = etree.XMLParser(remove_blank_text=False)
                self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser)
            except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
                raise ParseError(e)
        return self._edoc

    # Cache for the RDF element; filled in on first access to ``rdf_elem``.
    _rdf_elem = None

    @property
    def rdf_elem(self):
        """Return the first RDF element found below the document root.

        The result is cached on the instance.

        Raises:
            NoDublinCore: if the document contains no RDF element.
        """
        if self._rdf_elem is None:
            dc_path = './/' + RDFNS('RDF')
            self._rdf_elem = self.edoc.getroot().find(dc_path)
            if self._rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')
        return self._rdf_elem

    # Cache for the parsed metadata; filled in on first access to
    # ``book_info``.
    _book_info = None

    @property
    def book_info(self):
        """Return metadata built from the RDF element, or None.

        None is returned when the document was created with
        ``parse_dublincore`` false.  Otherwise the value is produced once by
        ``dcparser.BookInfo.from_element`` and cached.
        """
        if not self.parse_dublincore:
            return None
        if self._book_info is None:
            self._book_info = dcparser.BookInfo.from_element(
                    self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict)
        return self._book_info

    def __init__(self, iofile, provider=None,
            parse_dublincore=True, # shouldn't it be in a subclass?
            strict=False, # ?
            meta_fallbacks=None # ?
            ):
        """Store the source, validate the root element and load metadata.

        Args:
            iofile: source object; must provide ``get_string()``.
            provider: stored on the instance as ``self.provider``.
            parse_dublincore: when true, the metadata is parsed immediately.
            strict: forwarded to ``dcparser.BookInfo.from_element``.
            meta_fallbacks: forwarded to ``BookInfo.from_element`` as
                ``fallbacks``.

        Raises:
            ParseError: if the source cannot be parsed.
            ValidationError: if the root element is not ``utwor``.
            NoDublinCore: if metadata is requested but no RDF element exists.
        """
        self.source = iofile
        self.provider = provider
        self.parse_dublincore = parse_dublincore
        self.strict = strict
        self.meta_fallbacks = meta_fallbacks
        # Bind the root once so the error message below can refer to it.
        root_elem = self.edoc.getroot()
        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
        if parse_dublincore:
            # Accessed only for its side effect: parse (and cache) the
            # metadata now so that problems surface at construction time.
            self.book_info

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from ``xml`` wrapped by ``IOFile.from_string``.

        Remaining arguments are passed to the constructor.
        """
        return cls(IOFile.from_string(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from ``xmlfile`` via ``IOFile.from_filename``.

        Remaining arguments are passed to the constructor.
        """
        iofile = IOFile.from_filename(xmlfile)
        return cls(iofile, *args, **kwargs)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: <br/> elements are inserted into
            # ``elem`` while its original children are being processed.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end and always inserted at
                    # the same position, which leaves them in source order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                # Same back-to-front insertion, this time at the very start
                # of the stanza.
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def chunk(self, path):
        """Return the first element matching ``path``, or None if none match.

        ``path`` is translated with :meth:`path_to_xpath` before evaluation.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if len(elems) == 0:
            return None
        else:
            return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        Parts of the form ``tag[n]`` become ``*[n+1][name() = 'tag']``,
        turning the zero-based ``n`` into a one-based XPath position.  Other
        parts are copied unchanged, and a leading ``.`` part is replaced by
        an empty one so that the result starts with ``/``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via its ``xslt`` method."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one regenerated from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )

    def serialize(self):
        """Refresh the metadata and return the tree as pretty-printed unicode."""
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes addressed by the keys of ``chunk_dict``.

        Each key is a path understood by :meth:`path_to_xpath`; each value
        is markup that becomes the new content of the first matching node,
        which keeps its tag.

        Returns:
            A list with one ``repr((key, xpath, exception))`` string for
            every chunk that could not be merged.
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset for every key so that the error report never refers to
            # an unbound name or to the XPath of a previous key.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
                node.getparent().replace(node, repl)
            except Exception as e:
                # Deliberately best-effort: failures are collected and
                # reported to the caller instead of aborting the merge.
                unmerged.append( repr( (key, xpath, e) ) )

        return unmerged

    def clean_ed_note(self):
        """ deletes forbidden tags from nota_red """

        for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
            # clear() also drops the tail, so keep it and restore it after
            # the node has been emptied and renamed to <span>.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    # Converters
    #
    # Converter modules are imported inside each method rather than at
    # module level.

    def as_html(self, *args, **kwargs):
        """Return the result of ``librarian.pyhtml.transform`` for this document."""
        from librarian import pyhtml as html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return the result of ``librarian.text.transform`` for this document."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return the result of ``librarian.epub.transform`` for this document."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return ``pypdf.EduModulePDFFormat(self).build(...)`` for this document."""
        from librarian import pypdf
        return pypdf.EduModulePDFFormat(self).build(*args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return the result of ``librarian.mobi.transform`` for this document."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return the result of ``librarian.fb2.transform`` for this document."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Return the output file of a cover built from ``book_info``.

        ``cover_class`` defaults to
        ``librarian.styles.wolnelektury.cover.WLCover``; extra arguments
        are passed to the cover class constructor.
        """
        if cover_class is None:
            from librarian.styles.wolnelektury.cover import WLCover
            cover_class = WLCover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    def save_output_file(self, output_file, output_path=None,
            output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under a path derived from the arguments.

        When ``output_dir_path`` is given the target is
        ``output_dir_path[/<author>]/<slug>[.<ext>]``: the author directory
        is included only if ``make_author_dir`` is true and the extension
        only if ``ext`` is given.  Otherwise ``output_path`` is used as is.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path,
                        unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)