librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 #    This file is part of Librarian.
   4 #
   5 #    Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska <fundacja@nowoczesnapolska.org.pl>
   6 #
   7 #    For full list of contributors see AUTHORS file.
   8 #
   9 #    This program is free software: you can redistribute it and/or modify
  10 #    it under the terms of the GNU Affero General Public License as published by
  11 #    the Free Software Foundation, either version 3 of the License, or
  12 #    (at your option) any later version.
  13 #
  14 #    This program is distributed in the hope that it will be useful,
  15 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 #    GNU Affero General Public License for more details.
  18 #
  19 #    You should have received a copy of the GNU Affero General Public License
  20 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 #
  22 from librarian import ValidationError, NoDublinCore,  ParseError
  23 from librarian import RDFNS
  24 from librarian import dcparser
  25
  26 from xml.parsers.expat import ExpatError
  27 from lxml import etree
  28 from lxml.etree import XMLSyntaxError, XSLTApplyError
  29
  30 import re
  31 from StringIO import StringIO
  32
  33 class WLDocument(object):
  34     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
  35
  36     def __init__(self, edoc, parse_dublincore=True):
  37         self.edoc = edoc
  38
  39         root_elem = edoc.getroot()
  40
  41         dc_path = './/' + RDFNS('RDF')
  42
  43         if root_elem.tag != 'utwor':
  44             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
  45
  46         if parse_dublincore:
  47             self.rdf_elem = root_elem.find(dc_path)
  48
  49             if self.rdf_elem is None:
  50                 raise NoDublinCore('Document has no DublinCore - which is required.')
  51
  52             self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
  53         else:
  54             self.book_info = None
  55
  56     @classmethod
  57     def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
  58         return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)
  59
  60     @classmethod
  61     def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
  62
  63         # first, prepare for parsing
  64         if isinstance(xmlfile, basestring):
  65             file = open(xmlfile, 'rb')
  66             try:
  67                 data = file.read()
  68             finally:
  69                 file.close()
  70         else:
  71             data = xmlfile.read()
  72
  73         if not isinstance(data, unicode):
  74             data = data.decode('utf-8')
  75
  76         if swap_endlines:
  77             data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)
  78
  79         try:
  80             parser = etree.XMLParser(remove_blank_text=False)
  81             return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
  82         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
  83             raise ParseError(e)
  84
  85     def chunk(self, path):
  86         # convert the path to XPath
  87         expr = self.path_to_xpath(path)
  88         elems = self.edoc.xpath(expr)
  89
  90         if len(elems) == 0:
  91             return None
  92         else:
  93             return elems[0]
  94
  95     def path_to_xpath(self, path):
  96         parts = []
  97
  98         for part in path.split('/'):
  99             match = re.match(r'([^\[]+)\[(\d+)\]', part)
 100             if not match:
 101                 parts.append(part)
 102             else:
 103                 tag, n = match.groups()
 104                 parts.append("*[%d][name() = '%s']" % (int(n)+1, tag) )
 105
 106         if parts[0] == '.':
 107             parts[0] = ''
 108
 109         return '/'.join(parts)
 110
 111     def transform(self, stylesheet, **options):
 112         return self.edoc.xslt(stylesheet, **options)
 113
 114     def update_dc(self):
 115         if self.book_info:
 116             parent = self.rdf_elem.getparent()
 117             parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
 118
 119     def serialize(self):
 120         self.update_dc()
 121         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
 122
 123     def merge_chunks(self, chunk_dict):
 124         unmerged = []
 125
 126         for key, data in chunk_dict.iteritems():
 127             try:
 128                 xpath = self.path_to_xpath(key)
 129                 node = self.edoc.xpath(xpath)[0]
 130                 repl = etree.fromstring(u"<%s>%s</%s>" %(node.tag, data, node.tag) )
 131                 node.getparent().replace(node, repl);
 132             except Exception, e:
 133                 unmerged.append( repr( (key, xpath, e) ) )
 134
 135         return unmerged