1 # -*- coding: utf-8 -*-
3 # Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska
5 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
6 # For full license text see COPYING or <http://www.gnu.org/licenses/agpl.html>
8 from librarian import ValidationError, NoDublinCore, ParseError
9 from librarian import RDFNS
10 from librarian import dcparser
12 from xml.parsers.expat import ExpatError
13 from lxml import etree
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
17 from StringIO import StringIO
19 class WLDocument(object):
# Matches a '/' that is immediately followed by one whitespace character;
# both characters are consumed by a substitution using this pattern.
LINE_SWAP_EXPR = re.compile(r'/\s', flags=re.MULTILINE | re.UNICODE)
# Wrap a parsed document tree and validate it as a 'utwor' document.
#
# edoc: parsed tree object; the lines below only call its getroot().
# parse_dublincore: accepted by the signature.
#   NOTE(review): no line shown in this method reads parse_dublincore -
#   confirm how it gates the Dublin Core handling below.
#
# Raises ValidationError when the root tag is not 'utwor', and NoDublinCore
# when no RDF element is found below the root.
22 def __init__(self, edoc, parse_dublincore=True):
25 root_elem = edoc.getroot()
# Descendant search ('.//') for the element named by RDFNS('RDF')
# (presumably the namespace-qualified RDF tag - verify against RDFNS).
27 dc_path = './/' + RDFNS('RDF')
# Only documents whose root element is exactly 'utwor' are accepted.
29 if root_elem.tag != 'utwor':
30 raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
# The RDF element is kept on the instance; find() gives None on no match.
33 self.rdf_elem = root_elem.find(dc_path)
35 if self.rdf_elem is None:
36 raise NoDublinCore('Document has no DublinCore - which is required.')
# Build the book metadata object from the RDF element.
38 self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
    """Build a document from XML text held in a string.

    The text is wrapped in a ``StringIO`` and handed to ``cls.from_file``
    together with ``swap_endlines`` (positionally) and ``parse_dublincore``
    (by keyword). Whatever ``from_file`` returns or raises is passed on
    unchanged.
    """
    source = StringIO(xml)
    return cls.from_file(source, swap_endlines, parse_dublincore=parse_dublincore)
# Build a document from XML supplied through `xmlfile`.
#
# xmlfile: a string is treated as a filesystem path (see the isinstance test).
# swap_endlines / parse_dublincore: see the notes on the lines that use them.
#
# NOTE(review): the statement that first assigns `data`, the `try:` paired
# with the `except` clause at the end, and the body of that handler are not
# among the lines shown here - confirm them against the complete file.
47 def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
49 # first, prepare for parsing
# A string argument is opened as a file in binary mode.
# NOTE(review): the local name `file` shadows the Python 2 builtin.
50 if isinstance(xmlfile, basestring):
51 file = open(xmlfile, 'rb')
# Byte strings are decoded as UTF-8 so later steps operate on unicode text.
59 if not isinstance(data, unicode):
60 data = data.decode('utf-8')
# Every match of LINE_SWAP_EXPR is replaced by '<br />' plus a newline.
# NOTE(review): presumably applied only when swap_endlines is true - confirm.
63 data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)
# remove_blank_text=False asks lxml not to discard whitespace-only text.
66 parser = etree.XMLParser(remove_blank_text=False)
# The parsed tree goes to the class constructor; parse_dublincore is forwarded.
67 return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
# Parse failures raised by expat or lxml are intercepted here.
68 except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
# Look up the part of the wrapped document addressed by `path`.
#
# path: a path string in the form understood by path_to_xpath().
#
# NOTE(review): only the lookup is shown here; the statements that decide
# what is returned (and what happens when nothing matches) are not visible -
# confirm against the complete file.
71 def chunk(self, path):
72 # convert the path to XPath
73 expr = self.path_to_xpath(path)
# Evaluate the XPath expression against the wrapped document tree.
74 elems = self.edoc.xpath(expr)
# Translate a '/'-separated path into an XPath expression string.
#
# NOTE(review): the initialisation of `parts` and the handling of segments
# that do not match the pattern below are not among the lines shown here -
# confirm against the complete file.
81 def path_to_xpath(self, path):
84 for part in path.split('/'):
# Recognise segments shaped like  name[digits].  re.match anchors only at
# the start, so text following the closing ']' is not rejected.
85 match = re.match(r'([^\[]+)\[(\d+)\]', part)
89 tag, n = match.groups()
# Emitted as "the (n+1)-th child element, provided its name is `tag`":
# with this predicate order the position counts all child elements, not
# only those named `tag`.  The +1 presumably converts a zero-based index
# to XPath's one-based positions - confirm with callers.
90 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
# The translated segments are joined back with '/'.
95 return '/'.join(parts)
def transform(self, stylesheet, **options):
    """Apply an XSLT stylesheet to the wrapped document.

    ``stylesheet`` and any keyword ``options`` are forwarded unchanged to
    ``self.edoc.xslt``; the value that call produces is returned as is.
    """
    apply_xslt = self.edoc.xslt
    return apply_xslt(stylesheet, **options)
102 parent = self.rdf_elem.getparent()
103 parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
107 return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
# Replace nodes of the wrapped document with new content, one node per entry
# of `chunk_dict`, recording entries that could not be applied in `unmerged`.
#
# chunk_dict: mapping iterated with iteritems(); each key is a path accepted
#   by path_to_xpath(), each value is text placed inside the replacement
#   element.
#
# NOTE(review): the initialisation of `unmerged`, the try/except lines around
# the loop body and the end of the method are not among the lines shown here.
109 def merge_chunks(self, chunk_dict):
112 for key, data in chunk_dict.iteritems():
114 xpath = self.path_to_xpath(key)
# Only the first node matching the expression is replaced.
115 node = self.edoc.xpath(xpath)[0]
# The replacement is parsed from `data` wrapped in an element with the same
# tag as the old node. `data` is interpolated unescaped, so it must already
# be well-formed XML content; attributes of the old node are not copied.
116 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
117 node.getparent().replace(node, repl);
# This line reads `e`, so it presumably sits in an exception handler.
# NOTE(review): if path_to_xpath() is what failed, `xpath` is either unbound
# (first key) or still holds the previous key's expression - confirm.
119 unmerged.append( repr( (key, xpath, e) ) )