727f02e65bba631c63d77e06719b728c10044ec7
[librarian.git] / librarian / parser.py
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska  
4 #
5 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
6 # For full license text see COPYING or <http://www.gnu.org/licenses/agpl.html>
7 #
8 from librarian import ValidationError, NoDublinCore,  ParseError
9 from librarian import RDFNS
10 from librarian import dcparser
11
12 from xml.parsers.expat import ExpatError
13 from lxml import etree
14 from lxml.etree import XMLSyntaxError, XSLTApplyError
15
16 import re
17 from StringIO import StringIO
18
class WLDocument(object):
    """Wrapper around a parsed XML document whose root element is ``utwor``.

    Instances are normally created through the :meth:`from_string` or
    :meth:`from_file` constructors.  When Dublin Core parsing is enabled the
    RDF element found in the tree is kept in ``rdf_elem`` and its parsed form
    in ``book_info``.
    """

    # A slash followed by a single whitespace character.  from_file() replaces
    # every match with '<br />' plus a newline when swap_endlines is true.
    LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE)

    # A path step of the form ``tag[N]``, matched from the start of the step.
    _INDEXED_STEP_EXPR = re.compile(r'([^\[]+)\[(\d+)\]')

    def __init__(self, edoc, parse_dublincore=True):
        """Validate the tree and, optionally, parse its Dublin Core metadata.

        :param edoc: parsed document tree; must provide ``getroot()``.
        :param parse_dublincore: when true, locate the RDF element and build
            ``book_info`` from it; when false, ``book_info`` is ``None``.
        :raises ValidationError: if the root element is not ``utwor``.
        :raises NoDublinCore: if Dublin Core parsing was requested but the
            document contains no RDF element.
        """
        self.edoc = edoc
        # Always define the attribute, also when Dublin Core is not parsed.
        self.rdf_elem = None

        root_elem = edoc.getroot()

        dc_path = './/' + RDFNS('RDF')

        if root_elem.tag != 'utwor':
            raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)

        if parse_dublincore:
            self.rdf_elem = root_elem.find(dc_path)

            if self.rdf_elem is None:
                raise NoDublinCore('Document has no DublinCore - which is required.')

            self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
        else:
            self.book_info = None

    @classmethod
    def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
        """Build a document from XML text held in memory.

        The text is wrapped in a ``StringIO`` and handed to :meth:`from_file`;
        see that method for the meaning of the remaining arguments.
        """
        return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)

    @classmethod
    def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
        """Build a document from a file name or a file-like object.

        :param xmlfile: a path (string) opened here in binary mode, or any
            object with a ``read()`` method.
        :param swap_endlines: when true, every match of ``LINE_SWAP_EXPR`` in
            the source text is replaced with ``<br />`` and a newline before
            parsing.
        :param parse_dublincore: forwarded to the constructor.
        :raises ParseError: wrapping an ``ExpatError``, ``XMLSyntaxError`` or
            ``XSLTApplyError`` raised while parsing or constructing.
        """
        # first, prepare for parsing
        if isinstance(xmlfile, basestring):
            with open(xmlfile, 'rb') as source:
                data = source.read()
        else:
            data = xmlfile.read()

        # Byte strings are taken to be UTF-8 encoded.
        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        if swap_endlines:
            data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)

        try:
            parser = etree.XMLParser(remove_blank_text=False)
            return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def chunk(self, path):
        """Return the first node selected by *path*, or ``None`` if none match.

        *path* uses the notation understood by :meth:`path_to_xpath`.
        """
        # convert the path to XPath
        expr = self.path_to_xpath(path)
        elems = self.edoc.xpath(expr)

        if len(elems) == 0:
            return None
        return elems[0]

    def path_to_xpath(self, path):
        """Translate a slash-separated chunk path into an XPath expression.

        A step written as ``tag[N]`` becomes ``*[N+1][name() = 'tag']``
        (the index is shifted by one); any other step is copied unchanged.
        A leading ``.`` step is replaced with an empty one, so the resulting
        expression starts with ``/``.
        """
        parts = []

        for part in path.split('/'):
            match = self._INDEXED_STEP_EXPR.match(part)
            if not match:
                parts.append(part)
            else:
                tag, n = match.groups()
                parts.append("*[%d][name() = '%s']" % (int(n) + 1, tag))

        if parts[0] == '.':
            parts[0] = ''

        return '/'.join(parts)

    def transform(self, stylesheet, **options):
        """Apply *stylesheet* through the tree's ``xslt()`` method.

        Keyword *options* are passed along unchanged; the result of the
        ``xslt()`` call is returned.
        """
        return self.edoc.xslt(stylesheet, **options)

    def update_dc(self):
        """Replace the RDF element in the tree with one built from ``book_info``.

        Does nothing when ``book_info`` is falsy.
        """
        if self.book_info:
            parent = self.rdf_elem.getparent()
            fresh_rdf = self.book_info.to_etree(parent)
            parent.replace(self.rdf_elem, fresh_rdf)
            # Track the element that now lives in the tree.  The replaced one
            # is detached, so a later call would otherwise find no parent.
            self.rdf_elem = fresh_rdf

    def serialize(self):
        """Return the document as pretty-printed unicode text.

        The Dublin Core section is refreshed with :meth:`update_dc` first.
        """
        self.update_dc()
        return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)

    def merge_chunks(self, chunk_dict):
        """Replace nodes of the tree with new content, chunk by chunk.

        :param chunk_dict: mapping of chunk path (see :meth:`path_to_xpath`)
            to the new inner XML text of the node selected by that path.
        :returns: a list with one ``repr((key, xpath, exception))`` string for
            every chunk that could not be merged; empty if all succeeded.
            ``xpath`` is ``None`` when the path itself could not be converted.
        """
        unmerged = []

        for key, data in chunk_dict.items():
            # Bound before the try block so the handler below can always
            # report it, even when the path conversion is what failed.
            xpath = None
            try:
                xpath = self.path_to_xpath(key)
                node = self.edoc.xpath(xpath)[0]
                # Re-wrap the new content in the tag of the node it replaces.
                repl = etree.fromstring(u"<%s>%s</%s>" % (node.tag, data, node.tag))
                node.getparent().replace(node, repl)
            except Exception as e:
                # Best effort: record the failure and carry on with the rest.
                unmerged.append(repr((key, xpath, e)))

        return unmerged