turns out properties are not callable
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError
7 from librarian import RDFNS, IOFile
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
18
class WLDocument(object):
    """A parsed XML document whose root element must be ``utwor``.

    The XML tree (``edoc``), the RDF metadata element (``rdf_elem``) and the
    parsed metadata (``book_info``) are computed lazily on first access and
    then cached on the instance.
    """

    # A slash followed by one whitespace character; used by swap_endlines()
    # to split stanza text into separate lines.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    # One path segment of the form ``tag[N]``, as accepted by path_to_xpath().
    _INDEXED_PART_EXPR = re.compile(r'([^\[]+)\[(\d+)\]')
    provider = None

    # Cache for the ``edoc`` property.
    _edoc = None

    @property
    def edoc(self):
        """The lxml tree parsed from ``self.source``; parsed once, then cached.

        Raises ParseError if the source text cannot be parsed.
        """
        if self._edoc is None:
            data = self.source.get_string()
            if not isinstance(data, unicode):
                data = data.decode('utf-8')
            # Drop byte-order marks before handing the text to the parser.
            data = data.replace(u'\ufeff', u'')
            try:
                parser = etree.XMLParser()
                self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser)
            except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
                raise ParseError(e)
        return self._edoc

    # Cache for the ``rdf_elem`` property.
    _rdf_elem = None

    @property
    def rdf_elem(self):
        """The first RDF element found anywhere under the root; cached.

        Raises NoDublinCore if the document contains no such element.
        """
        if self._rdf_elem is None:
            dc_path = './/' + RDFNS('RDF')
            self._rdf_elem = self.edoc.getroot().find(dc_path)
            if self._rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')
        return self._rdf_elem

    # Cache for the ``book_info`` property.
    _book_info = None

    @property
    def book_info(self):
        """Metadata built by ``dcparser.BookInfo.from_element``; cached.

        Returns None when the document was created with
        ``parse_dublincore=False``.
        """
        if not self.parse_dublincore:
            return None
        if self._book_info is None:
            self._book_info = dcparser.BookInfo.from_element(
                    self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict)
        return self._book_info

    # NOTE(review): the original author questioned whether parse_dublincore,
    # strict and meta_fallbacks belong here or in a subclass.
    def __init__(self, iofile, provider=None, parse_dublincore=True,
                 strict=False,
                 meta_fallbacks=None):
        """Parse ``iofile`` and validate the document.

        iofile -- object providing ``get_string()`` with the XML text.
        provider -- stored on the instance; not used within this class.
        parse_dublincore -- when true, metadata is parsed immediately so that
            metadata errors surface at construction time.
        strict, meta_fallbacks -- passed on to ``BookInfo.from_element``.

        Raises ParseError on malformed XML, ValidationError when the root
        element is not ``utwor``, and NoDublinCore when metadata parsing is
        requested but no RDF element is present.
        """
        self.source = iofile
        self.provider = provider
        self.parse_dublincore = parse_dublincore
        self.strict = strict
        self.meta_fallbacks = meta_fallbacks
        root_elem = self.edoc.getroot()
        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
        if parse_dublincore:
            # Property access only, for its side effect: it parses and
            # caches the metadata (a property is not callable).
            self.book_info

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from XML text; extra arguments go to ``__init__``."""
        return cls(IOFile.from_string(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file name; extra arguments go to ``__init__``."""
        iofile = IOFile.from_filename(xmlfile)
        return cls(iofile, *args, **kwargs)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: <br/> elements are inserted into
            # ``elem`` while looping.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end but always inserted at the
                    # same index, so they end up in their original order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                # Same back-to-front insertion, at the start of the stanza.
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def chunk(self, path):
        """Return the first node matching ``path``, or None if nothing matches.

        ``path`` uses the syntax accepted by path_to_xpath().
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)
        return elems[0] if elems else None

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        Segments of the form ``tag[N]`` (N counted from zero) become
        ``*[N+1][name() = 'tag']``; all other segments are copied unchanged.
        A leading ``.`` segment is replaced with an empty one, so ``./a``
        yields ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = self._INDEXED_PART_EXPR.match(part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                # XPath positions are 1-based, the path's indexes 0-based.
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via its ``xslt`` method."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element in the tree with one built from ``book_info``.

        Does nothing when ``book_info`` is falsy (e.g. metadata parsing is
        disabled).
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Return the document as pretty-printed unicode text.

        Calls update_dc() first, so the metadata in the tree is refreshed.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes in the tree with new content, on a best-effort basis.

        chunk_dict -- mapping of path (see path_to_xpath()) to the new inner
            XML of the node found at that path; the node's tag is kept.

        Returns a list with one ``repr((key, xpath, error))`` string for each
        entry that could not be merged; ``xpath`` is None when the path
        could not even be translated.
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset for every entry, so the error report below never hits an
            # unbound name or shows the expression of a previous entry.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Deliberately broad: one bad chunk must not abort the rest;
                # the failure is reported to the caller instead.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """Neutralise forbidden tags found inside ``nota_red`` elements.

        Each such element is emptied and renamed to ``span``; the text that
        follows it (its tail) is kept.
        """

        forbidden_tags = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in forbidden_tags)):
            # Save the tail and put it back after clear().
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    # Converters
    #
    # The converter modules are imported inside each method rather than at
    # module level.

    def as_html(self, *args, **kwargs):
        """Return ``librarian.pyhtml.transform`` applied to this document."""
        from librarian import pyhtml as html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return ``librarian.text.transform`` applied to this document."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return ``librarian.epub.transform`` applied to this document."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return the result of ``pypdf.EduModulePDFFormat(self).build``."""
        from librarian import pypdf
        return pypdf.EduModulePDFFormat(self).build(*args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return ``librarian.mobi.transform`` applied to this document."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return ``librarian.fb2.transform`` applied to this document."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Return the output file of a cover built from ``book_info``.

        cover_class -- class instantiated with ``book_info`` and the extra
            arguments; defaults to ``WLCover``.
        """
        if cover_class is None:
            from librarian.styles.wolnelektury.cover import WLCover
            cover_class = WLCover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    def save_output_file(self, output_file, output_path=None, output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under an explicit or a derived path.

        With ``output_dir_path`` the target is
        ``<output_dir_path>[/<author>]/<slug>[.<ext>]``, where the author
        directory is added only if ``make_author_dir`` is true and the slug
        comes from ``book_info.uri.slug``. Otherwise ``output_path`` is used
        as given.

        NOTE(review): if neither path is supplied, ``save_as`` is called
        with None -- confirm whether callers rely on that.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path, self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)