e605dd983dc59bbb8539fdc15d66a3491b751463
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
class WLDocument(object):
    """Wrapper around a parsed XML document whose root element is ``utwor``.

    The document is kept as an lxml element tree in ``edoc``.  When Dublin
    Core parsing is enabled, the ``RDF`` element is kept in ``rdf_elem`` and
    its parsed form in ``book_info``; otherwise both are ``None``.
    """

    # A slash followed by a single whitespace character; swap_endlines()
    # splits stanza text on this marker.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed element tree.

        :param edoc: element tree; its root element must be ``utwor``
        :param parse_dublincore: when true, locate the ``RDF`` element and
            build ``book_info`` from it
        :param provider: object with a ``by_uri()`` method, used by
            ``parts()`` to fetch child documents
        :param strict: passed through to ``dcparser.BookInfo.from_element``
        :param meta_fallbacks: passed through as ``fallbacks`` to
            ``dcparser.BookInfo.from_element``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if Dublin Core was requested but no ``RDF``
            element was found
        """
        self.edoc = edoc
        self.provider = provider
        # Always define both attributes, so that code inspecting them does
        # not hit AttributeError when Dublin Core parsing is disabled.
        self.rdf_elem = None
        self.book_info = None

        root_elem = edoc.getroot()
        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find('.//' + RDFNS('RDF'))

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(
                    self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from a string containing XML.

        Extra arguments are forwarded to ``from_file``.
        """
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file path or a file-like object.

        :param xmlfile: a path (opened in binary mode) or any object with
            a ``read()`` method
        :raises ParseError: wrapping any ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing the document

        Extra arguments are forwarded to the class constructor.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # drop every U+FEFF (byte order mark) character before parsing
        data = data.replace(u'\ufeff', u'')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def _insert_breaks(self, parent, index, chunks):
        """Insert a ``br`` element at ``index`` for each chunk but the first.

        Chunks are consumed from the end and always inserted at the same
        index, so the resulting ``br`` elements end up in document order.
        Returns the first chunk, i.e. the text preceding the first break.
        """
        while len(chunks) > 1:
            ins = etree.Element('br')
            ins.tail = chunks.pop()
            parent.insert(index, ins)
        return chunks.pop(0)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # iterate over a snapshot: new <br/> children are added below
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    child.tail = self._insert_breaks(
                            elem, elem.index(child) + 1, chunks)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                elem.text = self._insert_breaks(elem, 0, chunks)

    def parts(self):
        """Yield a document for every URI listed in ``book_info.parts``.

        Each part is fetched through ``provider.by_uri()`` and loaded with
        ``from_file``, sharing this document's provider.

        :raises NoProvider: if no provider was supplied
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                    provider=self.provider)

    def chunk(self, path):
        """Return the first element matching ``path``, or None if none does.

        ``path`` is translated with ``path_to_xpath`` before evaluation.
        """
        # convert the path to XPath
        elems = self.edoc.xpath(self.path_to_xpath(path))
        return elems[0] if elems else None

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A step of the form ``tag[n]`` (n being a zero-based number) becomes
        ``*[n+1][name() = 'tag']``; any other step is copied verbatim.
        A leading ``.`` step is replaced with an empty one, which makes the
        resulting expression start with ``/``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if match is None:
                parts.append(part)
                continue
            tag, n = match.groups()
            # XPath positions are one-based
            parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the document and return the result.

        Keyword options are forwarded to the tree's ``xslt()`` call.
        """
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the ``RDF`` element with one rebuilt from ``book_info``.

        Does nothing when ``book_info`` is empty or missing.
        """
        if not self.book_info:
            return
        parent = self.rdf_elem.getparent()
        new_rdf = self.book_info.to_etree(parent)
        parent.replace(self.rdf_elem, new_rdf)
        # Track the element that is now in the tree.  The replaced one is
        # detached, so keeping it would make the next update_dc() (and thus
        # a second serialize()) fail on a missing parent.
        self.rdf_elem = new_rdf

    def serialize(self):
        """Return the document as pretty-printed unicode text.

        The Dublin Core section is refreshed from ``book_info`` first.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace elements of the document with new content.

        :param chunk_dict: maps paths (as understood by ``path_to_xpath``)
            to markup that becomes the new content of the first element
            matching the path; the element keeps its tag
        :returns: list of ``repr((key, xpath, exception))`` strings, one for
            every chunk that could not be merged
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset for each chunk: if path_to_xpath() itself fails, the
            # report below must not hit an unbound name or reuse the
            # expression from the previous chunk.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                        u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # best effort: report the failure and go on with the rest
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """Neutralize forbidden tags found inside ``nota_red`` elements.

        Every matching element is emptied (text, children and attributes
        are dropped) and renamed to ``span``; its tail text is preserved.
        """
        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        expr = '|'.join('//nota_red//%s' % tag for tag in forbidden)

        for node in self.edoc.xpath(expr):
            # clear() also wipes the tail, so save and restore it
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors +
                        self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters
    #
    # Each converter imports its module at call time and delegates to that
    # module's transform() function, forwarding all arguments.

    def as_html(self, *args, **kwargs):
        """Convert the document using ``librarian.html.transform``."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Convert the document using ``librarian.text.transform``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Convert the document using ``librarian.epub.transform``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Convert the document using ``librarian.pdf.transform``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Convert the document using ``librarian.mobi.transform``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Convert the document using ``librarian.fb2.transform``."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
            output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under a given or a computed path.

        When ``output_dir_path`` is given, the target is built inside it
        from the book's URI slug, optionally within a subdirectory named
        after the author (``make_author_dir``) and with ``ext`` appended as
        the file extension.  Otherwise ``output_path`` is used as is; note
        that ``ext`` is ignored in that case.

        The path is handed to ``output_file.save_as()``.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path,
                        unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)