Accept dates like "2 poł. XIX w."
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 from librarian import ValidationError, NoDublinCore,  ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian import dcparser
9
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
13
14 import os
15 import re
16 from StringIO import StringIO
17
class WLDocument(object):
    """A parsed XML document whose root element is ``utwor``.

    Wraps an lxml element tree (``self.edoc``) and, optionally, the metadata
    parsed from the RDF element embedded in it (``self.book_info``).  Offers
    helpers for locating and replacing fragments, cleaning up the tree and
    converting the document to the output formats implemented by the
    ``librarian`` sub-modules.
    """

    # A slash followed by a single whitespace character; swap_endlines()
    # splits stanza text on it.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    # One path step of the form ``tag[n]``; used by path_to_xpath().
    _INDEXED_STEP_EXPR = re.compile(r'([^\[]+)\[(\d+)\]')
    # Class-level default; overwritten per instance in __init__.
    provider = None

    def __init__(self, edoc, parse_dublincore=True, provider=None):
        """Wrap an already parsed element tree.

        :param edoc: parsed tree; must provide ``getroot()``.
        :param parse_dublincore: when true, the RDF element is looked up and
            parsed into ``self.book_info``; otherwise ``book_info`` is None
            and ``rdf_elem`` is not set at all.
        :param provider: object used by :meth:`parts` to fetch child
            documents (its ``by_uri`` method is called there).
        :raises ValidationError: if the root element is not ``utwor``.
        :raises NoDublinCore: if ``parse_dublincore`` is true and no RDF
            element is found.
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()

        # Search expression for the RDF element anywhere below the root.
        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
        else:
            self.book_info = None

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from XML text.

        The text is wrapped in a ``StringIO`` and handed to
        :meth:`from_file`; extra arguments are passed through unchanged.
        """
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, parse_dublincore=True, provider=None):
        """Build a document from a file name or a file-like object.

        :param xmlfile: a path (any ``basestring``) that is opened in binary
            mode, or an object with a ``read()`` method.
        :param parse_dublincore: passed to the constructor.
        :param provider: passed to the constructor.
        :raises ParseError: wrapping ``ExpatError``, ``XMLSyntaxError`` or
            ``XSLTApplyError`` raised while parsing or constructing.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            # The context manager closes the handle even if read() fails.
            with open(xmlfile, 'rb') as handle:
                data = handle.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # Drop byte-order marks wherever they occur in the text.
        data = data.replace(u'\ufeff', '')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)

            return cls(tree, parse_dublincore=parse_dublincore, provider=provider)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags.

        Inside every ``strofa`` element, text is split on
        ``LINE_SWAP_EXPR`` (a slash followed by whitespace) and a ``br``
        element is inserted at each split point.
        """
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot, because new <br/> children are
            # inserted into ``elem`` while looping.
            for child in list(elem):
                if child.tail:
                    chunks = self.LINE_SWAP_EXPR.split(child.tail)
                    ins_index = elem.index(child) + 1
                    # Chunks are taken from the end and always inserted at
                    # the same index, so they end up in their original order.
                    while len(chunks) > 1:
                        ins = etree.Element('br')
                        ins.tail = chunks.pop()
                        elem.insert(ins_index, ins)
                    # The first chunk stays as the tail of the child.
                    child.tail = chunks.pop(0)
            if elem.text:
                chunks = self.LINE_SWAP_EXPR.split(elem.text)
                while len(chunks) > 1:
                    ins = etree.Element('br')
                    ins.tail = chunks.pop()
                    elem.insert(0, ins)
                elem.text = chunks.pop(0)

    def parts(self):
        """Yield a document for every URI in ``self.book_info.parts``.

        Each part is fetched with ``self.provider.by_uri`` and loaded through
        :meth:`from_file` with the same provider.  This is a generator, so
        the checks below run when iteration starts.

        :raises NoProvider: if no provider was supplied.
        :raises NoDublinCore: if the document was created without metadata.
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                    provider=self.provider)

    def chunk(self, path):
        """Return the first node matching ``path``, or None if none match.

        ``path`` uses the notation accepted by :meth:`path_to_xpath`.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if len(elems) == 0:
            return None
        else:
            return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated path into an XPath expression.

        A step written as ``tag[n]`` becomes ``*[n+1][name() = 'tag']``
        (the index is shifted by one); any other step is copied verbatim.
        A leading ``.`` step is replaced by an empty one, so ``./a`` yields
        ``/a``.
        """
        parts = []

        for part in path.split('/'):
            match = self._INDEXED_STEP_EXPR.match(part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        # split() always returns at least one item, so parts[0] exists.
        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via ``self.edoc.xslt``.

        Keyword options are forwarded unchanged; the result of the call is
        returned as is.
        """
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element with one generated from ``book_info``.

        Does nothing when ``book_info`` is falsy (for example when the
        document was created with ``parse_dublincore=False``).
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            parent.replace(self.rdf_elem, self.book_info.to_etree(parent))

    def serialize(self):
        """Return the document as pretty-printed text.

        The metadata element is refreshed with :meth:`update_dc` first; the
        tree is then serialized with ``encoding=unicode``.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes of the document with new content, best effort.

        :param chunk_dict: mapping of path (see :meth:`path_to_xpath`) to the
            new inner XML for the node found at that path.
        :returns: a list with one ``repr((key, xpath, error))`` string for
            every entry that could not be merged; empty if all succeeded.
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Reset for every entry: if the path conversion itself fails, the
            # report must not hit an unbound name or show the previous
            # entry's expression.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                # NOTE(review): the replacement is built from the tag name
                # only, so attributes of the original node are not carried
                # over; its tail text is presumably dropped as well - confirm
                # whether that is intended.
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Deliberately broad: a failing chunk is reported, not fatal.
                unmerged.append(repr((key, xpath, e)))

        return unmerged

    def clean_ed_note(self):
        """ deletes forbidden tags from nota_red

        Every ``pa``, ``pe``, ``pr``, ``pt``, ``begin``, ``end`` and
        ``motyw`` element below a ``nota_red`` element is emptied and turned
        into a ``span``; the text following it is kept.
        """

        for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
                    ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
            # clear() also wipes the tail, so save it and put it back.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    # Converters
    # The modules are imported inside the methods rather than at file level.

    def as_html(self, *args, **kwargs):
        """Return ``librarian.html.transform(self, ...)``."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Return ``librarian.text.transform(self, ...)``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Return ``librarian.epub.transform(self, ...)``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Return ``librarian.pdf.transform(self, ...)``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Return ``librarian.mobi.transform(self, ...)``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
            output_dir_path=None, make_author_dir=False, ext=None):
        """Save ``output_file`` under a path derived from the arguments.

        :param output_file: object whose ``save_as(path)`` method is called.
        :param output_path: full target path; used only when
            ``output_dir_path`` is not given.
        :param output_dir_path: target directory; the file name is taken
            from ``self.book_info.uri.filename_stem()``.
        :param make_author_dir: with ``output_dir_path``, add a
            sub-directory named after ``self.book_info.author``
            (UTF-8 encoded).
        :param ext: with ``output_dir_path``, extension appended after a dot.
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(save_path,
                        unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path,
                                self.book_info.uri.filename_stem())
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)