# -*- coding: utf-8 -*-
# This file is part of Librarian.
# Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska <fundacja@nowoczesnapolska.org.pl>
# For full list of contributors see AUTHORS file.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re
from StringIO import StringIO
from xml.parsers.expat import ExpatError

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import ValidationError, NoDublinCore, ParseError
from librarian import RDFNS
from librarian import dcparser
class WLDocument(object):
    """Wrapper around a parsed XML tree whose root element is ``utwor``.

    Instances are normally created with :meth:`from_file` or
    :meth:`from_string`, which parse the XML with lxml and hand the tree to
    the constructor.

    NOTE(review): the source this class was restored from had lost its
    indentation and several statements.  Every statement that had to be
    inferred is flagged with a ``NOTE(review)`` comment - confirm each one
    against the project history before relying on it.
    """

    # A slash followed by a single whitespace character; ``from_file``
    # substitutes ``<br />`` plus a newline for each match when
    # ``swap_endlines`` is true.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)

    def __init__(self, edoc, parse_dublincore=True):
        """Validate the tree and optionally parse its Dublin Core block.

        :param edoc: parsed tree; must provide ``getroot()``.
        :param parse_dublincore: when true, locate the RDF element and build
            ``book_info`` from it.
        :raises ValidationError: the root element is not ``utwor``.
        :raises NoDublinCore: ``parse_dublincore`` is true and the tree has
            no RDF element.
        """
        # NOTE(review): assignment inferred - chunk(), transform() and
        # merge_chunks() all read ``self.edoc``.
        self.edoc = edoc

        root_elem = edoc.getroot()

        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        # NOTE(review): guard inferred - ``parse_dublincore`` is not consulted
        # anywhere else in the constructor.
        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
        else:
            # Define both attributes so later attribute reads never fail.
            self.rdf_elem = None
            self.book_info = None

    @classmethod
    def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
        """Build a document from XML held in a string.

        The string is wrapped in a ``StringIO`` and passed to
        :meth:`from_file`; the remaining arguments are forwarded unchanged.
        """
        return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)

    @classmethod
    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
        """Build a document from a file path or a readable file-like object.

        :param xmlfile: a path (``basestring``) opened in binary mode, or any
            object with a ``read()`` method.
        :param swap_endlines: when true, apply ``LINE_SWAP_EXPR`` to the text
            before parsing.
        :param parse_dublincore: forwarded to the constructor.
        :raises ParseError: the XML could not be parsed.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            # The context manager closes the handle even if read() fails.
            with open(xmlfile, 'rb') as stream:
                data = stream.read()
        else:
            # NOTE(review): branch inferred - from_string() passes a StringIO
            # object, which is not a ``basestring``.
            data = xmlfile.read()

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        # NOTE(review): guard inferred - ``swap_endlines`` is not consulted
        # anywhere else.
        if swap_endlines:
            data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            # NOTE(review): handler body inferred from the ParseError import.
            raise ParseError(e)

    def chunk(self, path):
        """Return the first node selected by ``path``, or ``None``.

        ``path`` uses the slash-separated syntax accepted by
        :meth:`path_to_xpath`.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        # NOTE(review): return logic inferred - the original statements after
        # the xpath() call were missing.
        if len(elems) == 0:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated chunk path into an XPath expression.

        A segment of the form ``tag[n]`` becomes
        ``*[n+1][name() = 'tag']``; the index is shifted by one because XPath
        positions start at 1.  Any other segment is copied through unchanged.
        """
        parts = []

        for part in path.split('/'):
            match = re.match(r'([^\[]+)\[(\d+)\]', part)
            if match is None:
                # NOTE(review): pass-through branch inferred - without it a
                # segment that has no index could not be unpacked below.
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        # NOTE(review): the original may have post-processed ``parts`` here;
        # statements between the loop and the return appear to be missing.
        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply ``stylesheet`` to the tree via ``edoc.xslt`` and return the result."""
        return self.edoc.xslt(stylesheet, **options)

    # NOTE(review): the ``def`` lines of the next two methods were missing;
    # the names ``update_dc`` and ``serialize`` are reconstructions and must
    # be confirmed against callers.
    def update_dc(self):
        """Replace the RDF element with one rebuilt from ``book_info``.

        Does nothing when no Dublin Core data was parsed.
        """
        if self.book_info is None:
            return

        parent = self.rdf_elem.getparent()
        fresh_rdf = self.book_info.to_etree(parent)
        parent.replace(self.rdf_elem, fresh_rdf)
        # Track the element that is now in the tree; the old one is detached
        # by replace(), so a second call would otherwise find no parent.
        self.rdf_elem = fresh_rdf

    def serialize(self):
        """Return the tree as pretty-printed ``unicode`` text."""
        # NOTE(review): the update_dc() call is inferred from the order of
        # the surviving statements.
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes in the tree with new content, chunk by chunk.

        :param chunk_dict: maps a chunk path (see :meth:`path_to_xpath`) to
            the new inner XML of the node that path selects.
        :returns: a list with one ``repr((key, xpath, error))`` string for
            every chunk that could not be merged; empty when all succeeded.
        """
        unmerged = []

        for key, data in chunk_dict.iteritems():
            # Bind before the try block so the error report below never
            # touches an unbound or stale ``xpath``.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # NOTE(review): exception type inferred; best-effort merge -
                # failures are collected rather than raised.
                unmerged.append(repr((key, xpath, e)))

        return unmerged