ignore trailing spaces in dc, add curriculum fields
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS, IOFile
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
class WLDocument(object):
    """An XML document whose root element must be ``utwor``.

    The document text is obtained from ``source.get_string()`` and parsed
    lazily with lxml the first time :attr:`edoc` is accessed.  When
    ``parse_dublincore`` is true, the RDF element found in the document is
    turned into a ``dcparser.BookInfo`` available as :attr:`book_info`.
    """

    # A slash followed by a single whitespace character; swap_endlines()
    # splits stanza text on this pattern.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    # Cache for the parsed tree, filled in by the ``edoc`` property.
    _edoc = None

    @property
    def edoc(self):
        """The parsed lxml tree of the source, built on first access.

        Byte strings are decoded as UTF-8 and every U+FEFF character is
        removed before parsing.

        :raises ParseError: when parsing raises ExpatError, XMLSyntaxError
            or XSLTApplyError
        """
        if self._edoc is None:
            data = self.source.get_string()
            if not isinstance(data, unicode):
                data = data.decode('utf-8')
            data = data.replace(u'\ufeff', '')
            try:
                parser = etree.XMLParser(remove_blank_text=False)
                self._edoc = etree.parse(StringIO(data.encode('utf-8')), parser)
            except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
                raise ParseError(e)
        return self._edoc

    # Cache for the RDF element, filled in by the ``rdf_elem`` property.
    _rdf_elem = None

    @property
    def rdf_elem(self):
        """The first ``RDFNS('RDF')`` element below the root, cached.

        :raises NoDublinCore: when the document contains no such element
        """
        if self._rdf_elem is None:
            dc_path = './/' + RDFNS('RDF')
            self._rdf_elem = self.edoc.getroot().find(dc_path)
            if self._rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')
        return self._rdf_elem

    # Cache for the parsed metadata, filled in by the ``book_info`` property.
    _book_info = None

    @property
    def book_info(self):
        """Metadata built from :attr:`rdf_elem`, cached after the first call.

        :returns: ``None`` when ``parse_dublincore`` is false, otherwise the
            result of ``dcparser.BookInfo.from_element``
        """
        if not self.parse_dublincore:
            return None
        if self._book_info is None:
            self._book_info = dcparser.BookInfo.from_element(
                    self.rdf_elem, fallbacks=self.meta_fallbacks, strict=self.strict)
        return self._book_info

    def __init__(self, iofile, provider=None,
            parse_dublincore=True, # shouldn't it be in a subclass?
            strict=False, # ?
            meta_fallbacks=None # ?
            ):
        """Store the options, parse the source and validate the root tag.

        :param iofile: object providing ``get_string()`` with the XML text
        :param provider: object with ``by_uri()``, used by :meth:`parts`
        :param parse_dublincore: when true, metadata is parsed immediately
        :param strict: passed on to ``dcparser.BookInfo.from_element``
        :param meta_fallbacks: passed on as ``fallbacks`` to
            ``dcparser.BookInfo.from_element``
        :raises ValidationError: when the root element is not ``utwor``
        """
        self.source = iofile
        self.provider = provider
        self.parse_dublincore = parse_dublincore
        self.strict = strict
        self.meta_fallbacks = meta_fallbacks

        root_tag = self.edoc.getroot().tag
        if root_tag != 'utwor':
            # The message used to reference an undefined name, which turned
            # this validation failure into a NameError.
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'" % root_tag)
        if parse_dublincore:
            # Accessed only for its side effect: parse the metadata now so
            # that any error surfaces at construction time.
            self.book_info

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from ``xml`` wrapped by ``IOFile.from_string``."""
        return cls(IOFile.from_string(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file name (string) or another object.

        Strings go through ``IOFile.from_filename``; anything else is handed
        to ``IOFile.from_file``.
        """
        if isinstance(xmlfile, basestring):
            iofile = IOFile.from_filename(xmlfile)
        else:
            iofile = IOFile.from_file(xmlfile)
        return cls(iofile, *args, **kwargs)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags.

        For every ``strofa`` element, its own text and the tails of its
        direct children are split on ``LINE_SWAP_EXPR``; a ``br`` element is
        inserted at each split point.
        """
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: new <br/> elements are inserted into
            # ``elem`` while looping.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end and always inserted at
                    # the same index, so they end up in original order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI in ``book_info.parts``.

        Each part is loaded with ``from_file(provider.by_uri(uri))`` and
        receives the same provider.  This is a generator, so the exceptions
        below are raised when iteration starts.

        :raises NoDublinCore: when there is no book info
        :raises NoProvider: when there are parts but no provider is set
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        if self.book_info.parts and self.provider is None:
            raise NoProvider('No document provider supplied.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                    provider=self.provider)

    def chunk(self, path):
        """Return the first node matching ``path``, or ``None`` if none does.

        ``path`` is translated with :meth:`path_to_xpath` first.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)
        if not elems:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A segment of the form ``tag[n]`` becomes
        ``*[n+1][name() = 'tag']``; other segments are kept unchanged.
        A leading ``.`` segment is replaced with an empty one, so ``./a``
        becomes ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                # The incoming index is shifted by one for XPath.
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the document via ``edoc.xslt``."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one rebuilt from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Return the document as a pretty-printed unicode string.

        :meth:`update_dc` is called first.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes addressed by path with freshly parsed content.

        For each ``path -> data`` item the first node matching the path is
        replaced by an element with the same tag whose content is parsed
        from ``data``.  Failures do not stop the loop.

        :param chunk_dict: mapping of paths (see :meth:`path_to_xpath`) to
            markup strings
        :returns: list of ``repr((key, xpath, exception))`` strings, one per
            item that could not be merged
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset per item so the except clause never sees an unbound
            # name or the xpath left over from the previous key.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Deliberately broad: every failure is reported to the
                # caller through the returned list.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """ deletes forbidden tags from nota_red

        Each matching element is emptied and renamed to ``span``; its tail
        text is kept.
        """
        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        expr = '|'.join('//nota_red//%s' % tag for tag in forbidden)
        for node in self.edoc.xpath(expr):
            # clear() also drops the tail, so save and restore it.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors +
                        self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters

    def as_html(self, *args, **kwargs):
        """Delegate to ``librarian.pyhtml.transform``."""
        from librarian import pyhtml as html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Delegate to ``librarian.text.transform``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Delegate to ``librarian.epub.transform``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Delegate to ``librarian.pdf.transform``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Delegate to ``librarian.mobi.transform``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Delegate to ``librarian.fb2.transform``."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Return ``output_file()`` of a cover built from ``book_info``.

        :param cover_class: class to instantiate; defaults to
            ``librarian.cover.WLCover``
        """
        if cover_class is None:
            from librarian.cover import WLCover
            cover_class = WLCover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    def save_output_file(self, output_file, output_path=None,
            output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under an explicit or a derived path.

        When ``output_dir_path`` is given the target is
        ``output_dir_path[/author]/slug[.ext]``, using ``book_info`` for the
        author and slug; otherwise ``output_path`` is used as is.

        :param output_file: object providing ``save_as(path)``
        :param make_author_dir: add a directory named after the author
        :param ext: extension appended after a dot, if given
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path,
                        unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)