fix conjunctions when not hyphenating
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian.cover import DefaultEbookCover
9 from librarian import dcparser
10
11 from xml.parsers.expat import ExpatError
12 from lxml import etree
13 from lxml.etree import XMLSyntaxError, XSLTApplyError
14
15 import os
16 import re
17 from StringIO import StringIO
18
19
class WLDocument(object):
    """Wrapper around a parsed XML tree whose root element is ``utwor``.

    Holds the lxml tree in ``self.edoc``, an optional document provider in
    ``self.provider`` and, when Dublin Core parsing is requested, the RDF
    element in ``self.rdf_elem`` and the parsed metadata in
    ``self.book_info``.
    """

    # A slash followed by any whitespace character marks a line break
    # inside a stanza (see swap_endlines).
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed element tree.

        :param edoc: parsed tree; must provide ``getroot()``
        :param parse_dublincore: when true, locate the RDF element and build
            ``self.book_info`` from it; otherwise ``self.book_info`` is None
        :param provider: object used by ``parts()`` to fetch child documents
        :param strict: passed through to ``dcparser.BookInfo.from_element``
        :param meta_fallbacks: passed through as ``fallbacks`` to
            ``dcparser.BookInfo.from_element``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if Dublin Core parsing was requested but no RDF
            element was found
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()

        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(
                    self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
        else:
            self.book_info = None

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from XML text by delegating to ``from_file``."""
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a file path or a file-like object.

        :param xmlfile: a path string (opened in binary mode) or any object
            with a ``read()`` method
        :raises ParseError: wrapping any ExpatError, XMLSyntaxError or
            XSLTApplyError raised while parsing or constructing the document
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            # `with` guarantees the handle is closed even if read() fails.
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # Drop byte-order marks wherever they appear in the text.
        data = data.replace(u'\ufeff', '')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags.

        Inside every ``strofa`` element, the element text and each child's
        tail are split on LINE_SWAP_EXPR and a ``br`` element is inserted at
        every split point.
        """
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: new <br/> children are inserted below.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Taking chunks from the end and always inserting at the
                    # same index leaves them in document order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI listed in ``book_info.parts``.

        Each part is fetched through ``self.provider.by_uri`` and loaded with
        ``from_file``, sharing this document's provider.

        :raises NoProvider: if no provider was supplied
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri), provider=self.provider)

    def chunk(self, path):
        """Return the first node matching ``path``, or None if none match."""
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if len(elems) == 0:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A segment of the form ``tag[n]`` (zero-based ``n``) becomes
        ``*[n+1][name() = 'tag']``; any other segment is copied verbatim.
        A leading ``.`` segment is replaced by an empty string, so the
        resulting expression starts with ``/``.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                # XPath positions are one-based, path indices are zero-based.
                parts.append("*[%d][name() = '%s']" % (int(n)+1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via ``edoc.xslt`` and return the result."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one regenerated from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Refresh the Dublin Core element and return the tree as pretty-printed unicode."""
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes addressed by path with freshly parsed content.

        :param chunk_dict: mapping of path (as accepted by ``path_to_xpath``)
            to the inner XML text that should replace the node's content
        :returns: list with one ``repr((key, xpath, error))`` string for
            every chunk that could not be merged; ``xpath`` is None when the
            path itself could not be converted
        """
        unmerged = []

        for key, data in chunk_dict.iteritems():
            # Reset per key so the error report never shows an unbound name
            # or the xpath left over from the previous key.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort: record the failure and keep merging the rest.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self, note_tag='nota_red'):
        """ deletes forbidden tags from nota_red

        Every forbidden element found below a ``note_tag`` element is
        emptied and renamed to ``span``; the text following it is kept.
        """

        for node in self.edoc.xpath('|'.join('//%s//%s' % (note_tag, tag) for tag in
                                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
            # clear() also wipes the tail, so save it and put it back.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        :raises NoDublinCore: if the document has no ``book_info``
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors + self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        persons.discard(None)
        return persons

    # Converters
    # Each converter imports its backend lazily, inside the method.

    def as_html(self, *args, **kwargs):
        """Return ``librarian.html.transform`` applied to this document."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return ``librarian.text.transform`` applied to this document."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return ``librarian.epub.transform`` applied to this document."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return ``librarian.pdf.transform`` applied to this document."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return ``librarian.mobi.transform`` applied to this document."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Return ``librarian.fb2.transform`` applied to this document."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def as_cover(self, cover_class=None, *args, **kwargs):
        """Build a cover from ``book_info`` and return its ``output_file()``.

        :param cover_class: cover implementation; defaults to
            DefaultEbookCover when None
        """
        if cover_class is None:
            cover_class = DefaultEbookCover
        return cover_class(self.book_info, *args, **kwargs).output_file()

    def save_output_file(self, output_file, output_path=None, output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under an explicit or a generated path.

        When ``output_dir_path`` is given, the target is built from that
        directory, an optional per-author subdirectory, the book's URI slug
        and an optional ``ext`` extension. Otherwise ``output_path`` is used
        as is.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path, unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path, self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)