librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska
   4 #
   5 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   6 # For full license text see COPYING or <http://www.gnu.org/licenses/agpl.html>
   7 #
   8 from librarian import ValidationError, NoDublinCore,  ParseError
   9 from librarian import RDFNS
  10 from librarian import dcparser
  11
  12 from xml.parsers.expat import ExpatError
  13 from lxml import etree
  14 from lxml.etree import XMLSyntaxError, XSLTApplyError
  15
  16 import re
  17 from StringIO import StringIO
  18
  19 class WLDocument(object):
  20     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
  21
  22     def __init__(self, edoc, parse_dublincore=True):
  23         self.edoc = edoc
  24
  25         root_elem = edoc.getroot()
  26
  27         dc_path = './/' + RDFNS('RDF')
  28
  29         if root_elem.tag != 'utwor':
  30             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
  31
  32         if parse_dublincore:
  33             self.rdf_elem = root_elem.find(dc_path)
  34
  35             if self.rdf_elem is None:
  36                 raise NoDublinCore('Document has no DublinCore - which is required.')
  37
  38             self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
  39         else:
  40             self.book_info = None
  41
  42     @classmethod
  43     def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
  44         return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)
  45
  46     @classmethod
  47     def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
  48
  49         # first, prepare for parsing
  50         if isinstance(xmlfile, basestring):
  51             file = open(xmlfile, 'rb')
  52             try:
  53                 data = file.read()
  54             finally:
  55                 file.close()
  56         else:
  57             data = xmlfile.read()
  58
  59         if not isinstance(data, unicode):
  60             data = data.decode('utf-8')
  61
  62         if swap_endlines:
  63             data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)
  64
  65         try:
  66             parser = etree.XMLParser(remove_blank_text=False)
  67             return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
  68         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
  69             raise ParseError(e)
  70
  71     def chunk(self, path):
  72         # convert the path to XPath
  73         expr = self.path_to_xpath(path)
  74         elems = self.edoc.xpath(expr)
  75
  76         if len(elems) == 0:
  77             return None
  78         else:
  79             return elems[0]
  80
  81     def path_to_xpath(self, path):
  82         parts = []
  83
  84         for part in path.split('/'):
  85             match = re.match(r'([^\[]+)\[(\d+)\]', part)
  86             if not match:
  87                 parts.append(part)
  88             else:
  89                 tag, n = match.groups()
  90                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
  91
  92         if parts[0] == '.':
  93             parts[0] = ''
  94
  95         return '/'.join(parts)
  96
  97     def transform(self, stylesheet, **options):
  98         return self.edoc.xslt(stylesheet, **options)
  99
 100     def update_dc(self):
 101         if self.book_info:
 102             parent = self.rdf_elem.getparent()
 103             parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
 104
 105     def serialize(self):
 106         self.update_dc()
 107         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
 108
 109     def merge_chunks(self, chunk_dict):
 110         unmerged = []
 111
 112         for key, data in chunk_dict.iteritems():
 113             try:
 114                 xpath = self.path_to_xpath(key)
 115                 node = self.edoc.xpath(xpath)[0]
 116                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
 117                 node.getparent().replace(node, repl);
 118             except Exception, e:
 119                 unmerged.append( repr( (key, xpath, e) ) )
 120
 121         return unmerged