# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
import os
import re
from StringIO import StringIO
from xml.parsers.expat import ExpatError

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
from librarian import RDFNS
from librarian import dcparser


class WLDocument(object):
    """Wrapper around a parsed WL XML tree whose root element is ``utwor``.

    An instance keeps the lxml tree in ``edoc``, an optional document
    provider in ``provider`` and, when Dublin Core parsing is enabled, the
    RDF element (``rdf_elem``) together with the parsed metadata
    (``book_info``).

    This is Python 2 code (``basestring``, ``unicode``, ``StringIO``).

    NOTE(review): statements tagged ``NOTE(review)`` below are assumptions
    that should be confirmed against the project's history.
    """

    # A slash followed by one whitespace character: the verse separator
    # used inside stanzas.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
    # A path step of the form ``tag[n]`` (matched from the start of the step).
    _PATH_STEP_EXPR = re.compile(r'([^\[]+)\[(\d+)\]')

    def __init__(self, edoc, parse_dublincore=True, provider=None,
                 strict=False, meta_fallbacks=None):
        """Wrap an already parsed tree and optionally read its Dublin Core.

        :param edoc: parsed element tree (must offer ``getroot()``)
        :param parse_dublincore: when true, locate the RDF element and build
            ``book_info`` from it; otherwise ``book_info`` is ``None``
        :param provider: object used by :meth:`parts` to fetch child documents
        :param strict: passed through to ``dcparser.BookInfo.from_element``
        :param meta_fallbacks: passed as ``fallbacks`` to
            ``dcparser.BookInfo.from_element``
        :raises ValidationError: if the root element is not ``utwor``
        :raises NoDublinCore: if Dublin Core is requested but no RDF element
            is present
        """
        self.edoc = edoc
        self.provider = provider

        root_elem = edoc.getroot()
        if root_elem.tag != 'utwor':
            raise ValidationError(
                "Invalid root element. Found '%s', should be 'utwor'"
                % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find('.//' + RDFNS('RDF'))
            if self.rdf_elem is None:
                raise NoDublinCore(
                    'Document has no DublinCore - which is required.')
            self.book_info = dcparser.BookInfo.from_element(
                self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
        else:
            # Keep both attributes defined so later checks such as
            # ``self.book_info is None`` never hit an AttributeError.
            self.rdf_elem = None
            self.book_info = None

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build a document from XML text; extra arguments go to ``__init__``."""
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build a document from a path or from a readable file-like object.

        :param xmlfile: a file name (``basestring``) or an object with
            ``read()``
        :raises ParseError: if the XML cannot be parsed
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # Drop byte-order marks wherever they occur in the text.
        data = data.replace(u'\ufeff', '')

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            tree = etree.parse(StringIO(data.encode('utf-8')), parser)
            return cls(tree, *args, **kwargs)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            # NOTE(review): handler body assumed to wrap the low-level
            # error in the package's ParseError.
            raise ParseError(e)

    def _break_lines(self, parent, index, text):
        """Split *text* on the verse separator, inserting ``<br>`` elements.

        Each chunk after the first becomes the tail of a new ``br`` element
        inserted into *parent* at *index*; the first chunk is returned so the
        caller can store it back where *text* came from.
        """
        chunks = self.LINE_SWAP_EXPR.split(text)
        # Take chunks from the end and insert at a fixed index, so the
        # resulting <br> elements end up in document order.
        while len(chunks) > 1:
            br = etree.Element('br')
            br.tail = chunks.pop()
            parent.insert(index, br)
        return chunks[0]

    def swap_endlines(self):
        """Converts line breaks in stanzas into <br/> tags."""
        # only swap inside stanzas
        for elem in self.edoc.iter('strofa'):
            # Iterate over a snapshot: new <br> children are inserted
            # while walking.
            for child in list(elem):
                # lxml reports a missing tail/text as None, which cannot
                # be split.
                if child.tail:
                    child.tail = self._break_lines(
                        elem, elem.index(child) + 1, child.tail)
            if elem.text:
                elem.text = self._break_lines(elem, 0, elem.text)

    def parts(self):
        """Yield a document for every part URI listed in ``book_info.parts``.

        Child documents are loaded through ``provider.by_uri`` and share this
        document's provider.

        :raises NoProvider: if no provider was supplied
        :raises NoDublinCore: if the document has no parsed Dublin Core
        """
        if self.provider is None:
            raise NoProvider('No document provider supplied.')
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        for part_uri in self.book_info.parts:
            yield self.from_file(self.provider.by_uri(part_uri),
                                 provider=self.provider)

    def chunk(self, path):
        """Look up the element addressed by *path* (see :meth:`path_to_xpath`).

        NOTE(review): the return statement is assumed - first match, or
        ``None`` when nothing matches.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)
        return elems[0] if elems else None

    def path_to_xpath(self, path):
        """Translate a ``tag[n]/tag[n]`` path (0-based) into an XPath string.

        A step of the form ``tag[n]`` becomes ``*[n+1][name() = 'tag']``;
        any other step is copied unchanged.
        """
        parts = []
        for part in path.split('/'):
            match = self._PATH_STEP_EXPR.match(part)
            if match is None:
                parts.append(part)
            else:
                tag, n = match.groups()
                # XPath positions are 1-based.
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        # NOTE(review): assumed - a leading '.' step is emptied so the
        # expression is anchored at the document root; confirm.
        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply *stylesheet* to the tree via ``edoc.xslt`` and return the result."""
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element in the tree with one built from ``book_info``.

        NOTE(review): the method name and the ``book_info`` guard are
        assumed - confirm.
        """
        if self.book_info is not None:
            parent = self.rdf_elem.getparent()
            new_rdf = self.book_info.to_etree(parent)
            parent.replace(self.rdf_elem, new_rdf)
            # Track the element that is now in the tree; the old one is
            # detached and would have no parent on a second call.
            self.rdf_elem = new_rdf

    def serialize(self):
        """Return the document as pretty-printed unicode XML.

        NOTE(review): the method name and the preceding ``update_dc()`` call
        are assumed - confirm.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace the content of addressed elements with new markup.

        :param chunk_dict: mapping of path (see :meth:`path_to_xpath`) to the
            inner XML that should replace the addressed element's content
        :returns: list of ``repr((key, xpath, error))`` strings, one per
            chunk that could not be merged
        """
        unmerged = []
        for key, data in chunk_dict.items():
            # Reset per key so the error report never reads an unbound or
            # stale value when path translation itself fails.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(
                    u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Deliberately broad: failures are collected and reported
                # instead of aborting the merge.
                # NOTE(review): exception class assumed.
                unmerged.append(repr((key, xpath, e)))
        return unmerged

    def clean_ed_note(self):
        """ deletes forbidden tags from nota_red """
        forbidden = ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw')
        expr = '|'.join('//nota_red//%s' % tag for tag in forbidden)
        for node in self.edoc.xpath(expr):
            # NOTE(review): loop body assumed - the element is emptied and
            # turned into a neutral <span>, keeping the text that follows it.
            tail = node.tail
            node.clear()
            node.tag = 'span'
            node.tail = tail

    def editors(self):
        """Returns a set of all editors for book and its children.

        :returns: set of dcparser.Person objects
        """
        if self.book_info is None:
            raise NoDublinCore('No Dublin Core in document.')
        persons = set(self.book_info.editors +
                      self.book_info.technical_editors)
        for child in self.parts():
            persons.update(child.editors())
        # NOTE(review): dropping a None placeholder is assumed - confirm.
        persons.discard(None)
        return persons

    # Converters. Each converter module is imported inside the method, so
    # it is only loaded when that output format is requested.

    def as_html(self, *args, **kwargs):
        """Delegate to ``librarian.html.transform``."""
        from librarian import html
        return html.transform(self, *args, **kwargs)

    def as_text(self, *args, **kwargs):
        """Delegate to ``librarian.text.transform``."""
        from librarian import text
        return text.transform(self, *args, **kwargs)

    def as_epub(self, *args, **kwargs):
        """Delegate to ``librarian.epub.transform``."""
        from librarian import epub
        return epub.transform(self, *args, **kwargs)

    def as_pdf(self, *args, **kwargs):
        """Delegate to ``librarian.pdf.transform``."""
        from librarian import pdf
        return pdf.transform(self, *args, **kwargs)

    def as_mobi(self, *args, **kwargs):
        """Delegate to ``librarian.mobi.transform``."""
        from librarian import mobi
        return mobi.transform(self, *args, **kwargs)

    def as_fb2(self, *args, **kwargs):
        """Delegate to ``librarian.fb2.transform``."""
        from librarian import fb2
        return fb2.transform(self, *args, **kwargs)

    def save_output_file(self, output_file, output_path=None,
                         output_dir_path=None, make_author_dir=False,
                         ext=None):
        """Save *output_file* under a derived or an explicit path.

        With *output_dir_path* the target is
        ``<dir>/[<author>/]<slug>[.<ext>]``, built from ``book_info``;
        otherwise *output_path* is used unchanged.

        :param output_file: object offering ``save_as(path)``
        :param output_path: explicit target path
        :param output_dir_path: directory in which to place the file
        :param make_author_dir: add a sub-directory named after the author
        :param ext: file extension (without the dot) to append
        """
        if output_dir_path:
            save_path = output_dir_path
            if make_author_dir:
                save_path = os.path.join(
                    save_path,
                    unicode(self.book_info.author).encode('utf-8'))
            save_path = os.path.join(save_path, self.book_info.uri.slug)
            if ext:
                save_path += '.%s' % ext
        else:
            save_path = output_path

        output_file.save_as(save_path)