1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from librarian import ValidationError, NoDublinCore, ParseError, NoProvider
7 from librarian import RDFNS
8 from librarian import dcparser
10 from xml.parsers.expat import ExpatError
11 from lxml import etree
12 from lxml.etree import XMLSyntaxError, XSLTApplyError
16 from StringIO import StringIO
# Wrapper around a parsed XML document tree; its constructor rejects any
# document whose root element is not 'utwor'.
18 class WLDocument(object):
# Compiled pattern matching a '/' followed by one whitespace character.
# swap_endlines() splits stanza text on it.
# NOTE(review): `re` is used here but no `import re` is visible among the
# imports above (the embedded line numbers skip 13-15) - confirm it exists.
19 LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
# Constructor: stores the provider, validates the root element and builds
# `book_info` from the document's RDF element.
#
# edoc             -- object exposing getroot(); from_file() passes the
#                     result of etree.parse().
# parse_dublincore -- NOTE(review): not referenced by any visible line; the
#                     embedded line numbers skip 32-33 and stop at 40, so the
#                     code that honours it is presumably missing - confirm.
# provider         -- stored as self.provider.
# strict           -- forwarded to dcparser.BookInfo.from_element().
#
# Raises ValidationError when the root tag is not 'utwor' and NoDublinCore
# when no RDF element is found under the root.
#
# NOTE(review): other methods read self.edoc, but no assignment to it is
# visible here (embedded number 23 is missing) - confirm against the full
# source.
22 def __init__(self, edoc, parse_dublincore=True, provider=None, strict=False):
24 self.provider = provider
26 root_elem = edoc.getroot()
# Search path for the RDF element anywhere below the root.
28 dc_path = './/' + RDFNS('RDF')
30 if root_elem.tag != 'utwor':
31 raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
34 self.rdf_elem = root_elem.find(dc_path)
36 if self.rdf_elem is None:
37 raise NoDublinCore('Document has no DublinCore - which is required.')
39 self.book_info = dcparser.BookInfo.from_element(
40 self.rdf_elem, strict=strict)
# The decorator is required: the function receives the class as ``cls`` and
# delegates to ``cls.from_file``. No decorator line was visible above this
# definition.
@classmethod
def from_string(cls, xml, *args, **kwargs):
    """Build a document from XML text held in memory.

    The text is wrapped in a ``StringIO`` file-like object and handed to
    ``cls.from_file``; all remaining arguments are forwarded unchanged.

    :param xml: the XML source as a string
    :param args: extra positional arguments for ``from_file``
    :param kwargs: extra keyword arguments for ``from_file``
    :returns: whatever ``cls.from_file`` returns
    """
    return cls.from_file(StringIO(xml), *args, **kwargs)
# Alternate constructor: parses XML from `xmlfile` and returns
# cls(tree, parse_dublincore=..., provider=...).
#
# xmlfile -- when it is a string (Python 2 `basestring`) it is opened as a
#            file in binary mode.
#            NOTE(review): how other inputs are handled, and the read into
#            `data`, are not visible (embedded numbers 54-60 are missing).
#
# NOTE(review): takes `cls` and is called as cls.from_file()/self.from_file()
# elsewhere, but no @classmethod line is visible above it - confirm.
49 def from_file(cls, xmlfile, parse_dublincore=True, provider=None):
51 # first, prepare for parsing
52 if isinstance(xmlfile, basestring):
# NOTE(review): `file` shadows the Python 2 builtin; the matching close() is
# not visible here.
53 file = open(xmlfile, 'rb')
# Byte strings are decoded as UTF-8 so the next step works on unicode text.
61 if not isinstance(data, unicode):
62 data = data.decode('utf-8')
# Remove every U+FEFF (byte order mark) character from the text.
64 data = data.replace(u'\ufeff', '')
# Blank text nodes are kept by the parser (remove_blank_text=False).
67 parser = etree.XMLParser(remove_blank_text=False)
# The text is re-encoded to UTF-8 bytes before being handed to lxml.
68 tree = etree.parse(StringIO(data.encode('utf-8')), parser)
70 return cls(tree, parse_dublincore=parse_dublincore, provider=provider)
# Python 2-only `except ..., e` syntax.
# NOTE(review): the matching `try:` and the handler body are not visible in
# this listing.
71 except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
# Rewrites text inside every 'strofa' element in place: wherever
# LINE_SWAP_EXPR matches, the text is split and <br> elements carry the
# pieces after the first one as their tails.
# NOTE(review): embedded numbers 79, 87 and 92 are missing. Guards for an
# empty tail/text and the insert() for the element-text case are presumably
# on those lines - confirm against the full source.
74 def swap_endlines(self):
75 """Converts line breaks in stanzas into <br/> tags."""
76 # only swap inside stanzas
77 for elem in self.edoc.iter('strofa'):
# Iterate over a snapshot of the children because the loop inserts new
# siblings into `elem`.
78 for child in list(elem):
80 chunks = self.LINE_SWAP_EXPR.split(child.tail)
81 ins_index = elem.index(child) + 1
# Pieces are popped from the end and always inserted at the same index, so
# the resulting <br> elements end up in the original text order.
82 while len(chunks) > 1:
83 ins = etree.Element('br')
84 ins.tail = chunks.pop()
85 elem.insert(ins_index, ins)
# The first piece stays as the child's own tail.
86 child.tail = chunks.pop(0)
# Same splitting for the text that precedes the first child.
88 chunks = self.LINE_SWAP_EXPR.split(elem.text)
89 while len(chunks) > 1:
90 ins = etree.Element('br')
91 ins.tail = chunks.pop()
93 elem.text = chunks.pop(0)
# NOTE(review): the `def` line of this method is not visible (embedded
# number 95 is missing). Another method calls self.parts(), which is
# presumably this generator - confirm.
#
# Generator body: for each URI in self.book_info.parts, asks the provider
# for the part (provider.by_uri) and yields the document built from it by
# from_file(), passing the same provider on.
# Because the body contains `yield`, the two checks below run when iteration
# starts, not when the method is called.
# Raises NoProvider when no provider is set and NoDublinCore when
# book_info is None.
96 if self.provider is None:
97 raise NoProvider('No document provider supplied.')
98 if self.book_info is None:
99 raise NoDublinCore('No Dublin Core in document.')
100 for part_uri in self.book_info.parts:
101 yield self.from_file(self.provider.by_uri(part_uri),
102 provider=self.provider)
# Evaluates `path` against the document tree: the path is translated by
# path_to_xpath() and the result is run through self.edoc.xpath().
# NOTE(review): the rest of the method (embedded numbers 108-112) is not
# visible, so what is returned from `elems` cannot be stated from here.
104 def chunk(self, path):
105 # convert the path to XPath
106 expr = self.path_to_xpath(path)
107 elems = self.edoc.xpath(expr)
# Translates a '/'-separated path into an XPath expression, one segment at a
# time, and joins the translated segments with '/'.
# A segment of the form `tag[n]` becomes `*[n+1][name() = 'tag']`: the index
# is incremented by one (presumably because the input is zero-based while
# XPath positions start at 1 - confirm with callers).
# NOTE(review): the initialisation of `parts`, the handling of segments that
# do not match the pattern, and embedded lines 124-127 are not visible.
114 def path_to_xpath(self, path):
117 for part in path.split('/'):
# Group 1 is the tag name (everything before '['), group 2 the digits.
118 match = re.match(r'([^\[]+)\[(\d+)\]', part)
122 tag, n = match.groups()
123 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
128 return '/'.join(parts)
def transform(self, stylesheet, **options):
    """Run ``self.edoc.xslt`` with the given stylesheet.

    :param stylesheet: passed unchanged as the first argument of
        ``self.edoc.xslt``
    :param options: keyword arguments forwarded unchanged
    :returns: whatever ``self.edoc.xslt`` returns
    """
    return self.edoc.xslt(stylesheet, **options)
# NOTE(review): no `def` line is visible for the statements below, and the
# embedded numbers jump 131 -> 135 and 136 -> 140, so they probably belong
# to two different methods whose headers are missing - confirm.
#
# Replace the current RDF element in its parent with a fresh element
# produced by self.book_info.to_etree(parent).
135 parent = self.rdf_elem.getparent()
136 parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
# Return the whole tree serialised as pretty-printed text; encoding=unicode
# (Python 2) asks lxml for a unicode string rather than encoded bytes.
140 return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
# Replaces nodes of the document with new content.
# chunk_dict maps a path (in the form understood by path_to_xpath) to markup.
# For each entry the first node matching the path is replaced by a new
# element with the same tag whose content is parsed from the markup.
# The replacement is built from the tag name and `data` only, so `data` must
# parse as XML content and the old node's attributes are not copied.
# NOTE(review): the try/except around the replacement, the initialisation of
# `unmerged` and the return statement are not visible (embedded numbers
# 143-144, 146, 151, 153+ are missing).
142 def merge_chunks(self, chunk_dict):
# iteritems() is Python 2-only.
145 for key, data in chunk_dict.iteritems():
147 xpath = self.path_to_xpath(key)
148 node = self.edoc.xpath(xpath)[0]
149 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
150 node.getparent().replace(node, repl)
# Failures are recorded as the repr of (key, xpath, exception).
152 unmerged.append( repr( (key, xpath, e) ) )
# Visits every element named pa, pe, pr, pt, begin, end or motyw that is a
# descendant of a 'nota_red' element; the selection is one XPath union
# ('|') of '//nota_red//<tag>' expressions.
# NOTE(review): the loop body (embedded numbers 161+) is not visible, so what
# is done to each selected node cannot be stated from here.
156 def clean_ed_note(self):
157 """ deletes forbidden tags from nota_red """
159 for node in self.edoc.xpath('|'.join('//nota_red//%s' % tag for tag in
160 ('pa', 'pe', 'pr', 'pt', 'begin', 'end', 'motyw'))):
# NOTE(review): the `def` line, the closing quotes of the docstring and the
# return statement are not visible here. The body calls child.editors() on
# each part, so this is presumably the `editors` method itself - confirm.
#
# Visible behaviour: raises NoDublinCore when book_info is None; otherwise
# starts from book_info.editors + book_info.technical_editors and merges in
# the result of editors() for every document yielded by self.parts().
167 """Returns a set of all editors for book and its children.
169 :returns: set of dcparser.Person objects
171 if self.book_info is None:
172 raise NoDublinCore('No Dublin Core in document.')
173 persons = set(self.book_info.editors +
174 self.book_info.technical_editors)
175 for child in self.parts():
176 persons.update(child.editors())
def as_html(self, *args, **kwargs):
    """Run ``librarian.html.transform`` on this document.

    Positional and keyword arguments are forwarded unchanged after ``self``.

    :returns: the result of ``html.transform(self, *args, **kwargs)``
    """
    # Imported at call time, not at module import.
    from librarian import html
    return html.transform(self, *args, **kwargs)
def as_text(self, *args, **kwargs):
    """Run ``librarian.text.transform`` on this document.

    Positional and keyword arguments are forwarded unchanged after ``self``.

    :returns: the result of ``text.transform(self, *args, **kwargs)``
    """
    # Imported at call time, not at module import.
    from librarian import text
    return text.transform(self, *args, **kwargs)
def as_epub(self, *args, **kwargs):
    """Run ``librarian.epub.transform`` on this document.

    Positional and keyword arguments are forwarded unchanged after ``self``.

    :returns: the result of ``epub.transform(self, *args, **kwargs)``
    """
    # Imported at call time, not at module import.
    from librarian import epub
    return epub.transform(self, *args, **kwargs)
def as_pdf(self, *args, **kwargs):
    """Run ``librarian.pdf.transform`` on this document.

    Positional and keyword arguments are forwarded unchanged after ``self``.

    :returns: the result of ``pdf.transform(self, *args, **kwargs)``
    """
    # Imported at call time, not at module import.
    from librarian import pdf
    return pdf.transform(self, *args, **kwargs)
def as_mobi(self, *args, **kwargs):
    """Run ``librarian.mobi.transform`` on this document.

    Positional and keyword arguments are forwarded unchanged after ``self``.

    :returns: the result of ``mobi.transform(self, *args, **kwargs)``
    """
    # Imported at call time, not at module import.
    from librarian import mobi
    return mobi.transform(self, *args, **kwargs)
def as_fb2(self, *args, **kwargs):
    """Run ``librarian.fb2.transform`` on this document.

    Positional and keyword arguments are forwarded unchanged after ``self``.

    :returns: the result of ``fb2.transform(self, *args, **kwargs)``
    """
    # Imported at call time, not at module import.
    from librarian import fb2
    return fb2.transform(self, *args, **kwargs)
# Saves `output_file` by calling output_file.save_as(save_path).
# The visible lines build save_path in one of two ways: starting from
# output_dir_path and appending an author directory, the book's URI slug and
# a '.<ext>' suffix, or taking output_path as given.
# NOTE(review): the conditions that choose between these steps are not
# visible (embedded numbers 209, 211, 216, 218 and 220 are missing), so
# which parameters enable which step must be confirmed against the full
# source. `os` is used here but no `import os` is visible above.
207 def save_output_file(self, output_file, output_path=None,
208 output_dir_path=None, make_author_dir=False, ext=None):
210 save_path = output_dir_path
# The author directory name is the author's unicode form encoded as UTF-8.
212 save_path = os.path.join(save_path,
213 unicode(self.book_info.author).encode('utf-8'))
214 save_path = os.path.join(save_path,
215 self.book_info.uri.slug)
217 save_path += '.%s' % ext
219 save_path = output_path
221 output_file.save_as(save_path)