#2044: handle weird whitespace for covers
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
class WLDocument(object):
    """Wrapper around a parsed XML document whose root element is 'utwor'.

    Holds the lxml element tree (``edoc``), the optional Dublin Core
    metadata parsed from the document's RDF element (``book_info``) and an
    optional ``provider`` used to look up other documents by URI.
    """

    # Splits text on a slash followed by one whitespace character.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    # Matches a path step of the form "tag[N]" (see path_to_xpath).
    _INDEXED_STEP_EXPR = re.compile(r'([^\[]+)\[(\d+)\]')
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None, strict=False):
        """Wrap an already parsed element tree.

        :param edoc: element tree of the document (must offer ``getroot()``)
        :param parse_dublincore: when true, locate the RDF element and build
            ``book_info`` from it
        :param provider: object used by ``parts()`` to fetch documents by URI
        :param strict: passed through to ``dcparser.BookInfo.from_element``
        :raises ValidationError: if the root element is not 'utwor'
        :raises NoDublinCore: if Dublin Core was requested but no RDF element
            was found
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()

        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(
                    self.rdf_elem, strict=strict)
        else:
            # Define the attribute in both branches so it can always be read.
            self.rdf_elem = None
            self.book_info = None

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from a string; extra arguments go to from_file()."""
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, parse_dublincore=True, provider=None,
            strict=False):
        """Build a document from a file name or a file-like object.

        :param xmlfile: a path (string) or an object with a ``read()`` method
        :param parse_dublincore: forwarded to the constructor
        :param provider: forwarded to the constructor
        :param strict: forwarded to the constructor
        :raises ParseError: wrapping ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing the document
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # Drop every U+FEFF (byte order mark) character before parsing.
        data = data.replace(u'\ufeff', u'')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            return cls(tree, parse_dublincore=parse_dublincore,
                    provider=provider, strict=strict)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: <br/> elements are inserted as we go.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end and always inserted at
                    # the same index, so they end up in document order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI listed in ``book_info.parts``.

        :raises NoProvider: if the document has no provider
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                    provider=self.provider)

    def chunk(self, path):
        """Return the first element matching ``path``, or None if none does.

        ``path`` uses the notation understood by path_to_xpath().
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if not elems:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A step written as ``tag[N]`` becomes ``*[N+1][name() = 'tag']``;
        other steps are copied unchanged.  A leading ``.`` step is replaced
        with an empty one, so the result starts with ``/``.
        """
        parts = []

        for part in path.split('/'):
            match = self._INDEXED_STEP_EXPR.match(part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the document via ``edoc.xslt``."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one rebuilt from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            new_rdf = self.book_info.to_etree(parent)
            parent.replace(self.rdf_elem, new_rdf)
            # Track the element that now lives in the tree; otherwise the
            # next call would start from the element that was just replaced.
            self.rdf_elem = new_rdf

    def serialize(self):
        """Refresh the Dublin Core element and return the document as unicode."""
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace elements of the document with new content.

        :param chunk_dict: mapping of path (see path_to_xpath) to the new
            inner XML of the element found at that path
        :returns: list of ``repr((key, xpath, exception))`` strings, one for
            each entry that could not be merged
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset per entry so a failure in path_to_xpath() is reported
            # with None rather than an unbound or stale value.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort by design: record the failure and carry on.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """Deletes forbidden tags from nota_red.

        Each forbidden element is emptied and renamed to 'span'; the text
        that followed it (its tail) is kept.
        """
        for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
            # clear() also drops the tail, so save and restore it.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    # Converters
    # Each converter imports its librarian module inside the method, at call time.

    def as_html(self, *args, **kwargs):
        """Return the result of ``librarian.html.transform`` for this document."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return the result of ``librarian.text.transform`` for this document."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return the result of ``librarian.epub.transform`` for this document."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return the result of ``librarian.pdf.transform`` for this document."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return the result of ``librarian.mobi.transform`` for this document."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
            output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` via its ``save_as()`` method.

        When ``output_dir_path`` is given, the target is built from it,
        optionally a subdirectory named after ``book_info.author``
        (``make_author_dir``), the slug of ``book_info.uri`` and, if given,
        the ``ext`` extension.  Otherwise ``output_path`` is used as is.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path,
                        unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)