fixes #2325: repetitions in editor list
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
18 class WLDocument(object):
19     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
20     provider = None
21
22     def __init__(self, edoc, parse_dublincore=True, provider=None, strict=False):
23         self.edoc = edoc
24         self.provider = provider
25
26         root_elem = edoc.getroot()
27
28         dc_path = './/' + RDFNS('RDF')
29
30         if root_elem.tag != 'utwor':
31             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
32
33         if parse_dublincore:
34             self.rdf_elem = root_elem.find(dc_path)
35
36             if self.rdf_elem is None:
37                 raise NoDublinCore('Document has no DublinCore - which is required.')
38
39             self.book_info = dcparser.BookInfo.from_element(
40                     self.rdf_elem, strict=strict)
41         else:
42             self.book_info = None
43
44     @classmethod
45     def from_string(cls, xml, *args, **kwargs):
46         return cls.from_file(StringIO(xml), *args, **kwargs)
47
48     @classmethod
49     def from_file(cls, xmlfile, parse_dublincore=True, provider=None):
50
51         # first, prepare for parsing
52         if isinstance(xmlfile, basestring):
53             file = open(xmlfile, 'rb')
54             try:
55                 data = file.read()
56             finally:
57                 file.close()
58         else:
59             data = xmlfile.read()
60
61         if not isinstance(data, unicode):
62             data = data.decode('utf-8')
63
64         data = data.replace(u'\ufeff', '')
65
66         try:
67             parser = etree.XMLParser(remove_blank_text=False)
68             tree = etree.parse(StringIO(data.encode('utf-8')), parser)
69
70             return cls(tree, parse_dublincore=parse_dublincore, provider=provider)
71         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
72             raise ParseError(e)
73
74     def swap_endlines(self):
75         """Converts line breaks in stanzas into <br/> tags."""
76         # only swap inside stanzas
77         for elem in self.edoc.iter('strofa'):
78             for child in list(elem):
79                 if child.tail:
80                     chunks = self.LINE_SWAP_EXPR.split(child.tail)
81                     ins_index = elem.index(child) + 1
82                     while len(chunks) > 1:
83                         ins = etree.Element('br')
84                         ins.tail = chunks.pop()
85                         elem.insert(ins_index, ins)
86                     child.tail = chunks.pop(0)
87             if elem.text:
88                 chunks = self.LINE_SWAP_EXPR.split(elem.text)
89                 while len(chunks) > 1:
90                     ins = etree.Element('br')
91                     ins.tail = chunks.pop()
92                     elem.insert(0, ins)
93                 elem.text = chunks.pop(0)
94
95     def parts(self):
96         if self.provider is None:
97             raise NoProvider('No document provider supplied.')
98         if self.book_info is None:
99             raise NoDublinCore('No Dublin Core in document.')
100         for part_uri in self.book_info.parts:
101             yield self.from_file(self.provider.by_uri(part_uri),
102                     provider=self.provider)
103
104     def chunk(self, path):
105         # convert the path to XPath
106         expr = self.path_to_xpath(path)
107         elems = self.edoc.xpath(expr)
108
109         if len(elems) == 0:
110             return None
111         else:
112             return elems[0]
113
114     def path_to_xpath(self, path):
115         parts = []
116
117         for part in path.split('/'):
118             match = re.match(r'([^\[]+)\[(\d+)\]', part)
119             if not match:
120                 parts.append(part)
121             else:
122                 tag, n = match.groups()
123                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
124
125         if parts[0] == '.':
126             parts[0] = ''
127
128         return '/'.join(parts)
129
130     def transform(self, stylesheet, **options):
131         return self.edoc.xslt(stylesheet, **options)
132
133     def update_dc(self):
134         if self.book_info:
135             parent = self.rdf_elem.getparent()
136             parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
137
138     def serialize(self):
139         self.update_dc()
140         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
141
142     def merge_chunks(self, chunk_dict):
143         unmerged = []
144
145         for key, data in chunk_dict.iteritems():
146             try:
147                 xpath = self.path_to_xpath(key)
148                 node = self.edoc.xpath(xpath)[0]
149                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
150                 node.getparent().replace(node, repl)
151             except Exception, e:
152                 unmerged.append( repr( (key, xpath, e) ) )
153
154         return unmerged
155
156     def clean_ed_note(self):
157         """ deletes forbidden tags from nota_red """
158
159         for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
160                     ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
161             tail = node.tail
162             node.clear()
163             node.tag = 'span'
164             node.tail = tail
165
166     def editors(self):
167         """Returns a set of all editors for book and its children.
168
169         :returns: set of dcparser.Person objects
170         """
171         if self.book_info is None:
172             raise NoDublinCore('No Dublin Core in document.')
173         persons = set(self.book_info.editors +
174                         self.book_info.technical_editors)
175         for child in self.parts():
176             persons.update(child.editors())
177         if None in persons:
178             persons.remove(None)
179         return persons
180
181     # Converters
182
183     def as_html(self, *args, **kwargs):
184         from librarian import html
185         return html.transform(self, *args, **kwargs)
186
187     def as_text(self, *args, **kwargs):
188         from librarian import text
189         return text.transform(self, *args, **kwargs)
190
191     def as_epub(self, *args, **kwargs):
192         from librarian import epub
193         return epub.transform(self, *args, **kwargs)
194
195     def as_pdf(self, *args, **kwargs):
196         from librarian import pdf
197         return pdf.transform(self, *args, **kwargs)
198
199     def as_mobi(self, *args, **kwargs):
200         from librarian import mobi
201         return mobi.transform(self, *args, **kwargs)
202
203     def as_fb2(self, *args, **kwargs):
204         from librarian import fb2
205         return fb2.transform(self, *args, **kwargs)
206
207     def save_output_file(self, output_file, output_path=None,
208             output_dir_path=None, make_author_dir=False, ext=None):
209         if output_dir_path:
210             save_path = output_dir_path
211             if make_author_dir:
212                 save_path = os.path.join(save_path,
213                         unicode(self.book_info.author).encode('utf-8'))
214             save_path = os.path.join(save_path,
215                                 self.book_info.uri.slug)
216             if ext:
217                 save_path += '.%s' % ext
218         else:
219             save_path = output_path
220
221         output_file.save_as(save_path)