librarian/parser.py

   1 # -*- coding: utf-8 -*-
   2 from librarian import ValidationError, NoDublinCore,  ParseError
   3 from librarian import RDFNS, DCNS
   4 from librarian import dcparser
   5
   6 from xml.parsers.expat import ExpatError
   7 from lxml import etree
   8 from lxml.etree import XMLSyntaxError, XSLTApplyError
   9
  10 import re
  11 from StringIO import StringIO
  12
  13 class WLDocument(object):
  14     LINE_SWAP_EXPR = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
  15
  16     def __init__(self, edoc, parse_dublincore=True):
  17         self.edoc = edoc
  18
  19         root_elem = edoc.getroot()
  20
  21         dc_path = './/' + RDFNS('RDF')
  22
  23         if root_elem.tag != 'utwor':
  24             raise ValidationError("Invalid root element. Found '%s', should be 'utwor'" % root_elem.tag)
  25
  26         if parse_dublincore:
  27             self.rdf_elem = root_elem.find(dc_path)
  28
  29             if self.rdf_elem is None:
  30                 raise NoDublinCore('Document has no DublinCore - which is required.')
  31
  32             self.book_info = dcparser.BookInfo.from_element(self.rdf_elem)
  33         else:
  34             self.book_info = None
  35
  36     @classmethod
  37     def from_string(cls, xml, swap_endlines=False, parse_dublincore=True):
  38         return cls.from_file(StringIO(xml), swap_endlines, parse_dublincore=parse_dublincore)
  39
  40     @classmethod
  41     def from_file(cls, xmlfile, swap_endlines=False, parse_dublincore=True):
  42
  43         # first, prepare for parsing
  44         if isinstance(xmlfile, basestring):
  45             file = open(xmlfile, 'rb')
  46             try:
  47                 data = file.read()
  48             finally:
  49                 file.close()
  50         else:
  51             data = xmlfile.read()
  52
  53         if not isinstance(data, unicode):
  54             data = data.decode('utf-8')
  55
  56         if swap_endlines:
  57             data = cls.LINE_SWAP_EXPR.sub(u'<br />\n', data)
  58
  59         try:
  60             parser = etree.XMLParser(remove_blank_text=True)
  61             return cls(etree.parse(StringIO(data), parser), parse_dublincore=parse_dublincore)
  62         except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
  63             raise ParseError(e)
  64
  65     def part_as_text(self, path):
  66         # convert the path to XPath
  67         print "[L] Retrieving part:", path
  68
  69         elems = self.edoc.xpath(self.path_to_xpath(path))
  70         print "[L] xpath", elems
  71
  72         if len(elems) == 0:
  73             return None
  74
  75         return etree.tostring(elems[0], encoding=unicode, pretty_print=True)
  76
  77
  78     def path_to_xpath(self, path):
  79         parts = []
  80
  81         for part in path.split('/'):
  82             match = re.match(r'([^\[]+)\[(\d+)\]', part)
  83             if not match:
  84                 parts.append(part)
  85             else:
  86                 tag, n = match.groups()
  87                 parts.append("node()[position() = %d and name() = '%s']" % (int(n), tag) )
  88
  89         if parts[0] == '.':
  90             parts[0] = ''
  91
  92         return '/'.join(parts)
  93
  94     def transform(self, stylesheet, **options):
  95         return self.edoc.xslt(stylesheet, **options)
  96
  97     def update_dc(self):
  98         parent = self.rdf_elem.getparent()
  99         parent.replace( self.rdf_elem, self.book_info.to_etree(parent) )
 100
 101     def serialize(self):
 102         self.update_dc()
 103         return etree.tostring(self.edoc, encoding=unicode, pretty_print=True)
 104
 105     def merge_chunks(self, chunk_dict):
 106         unmerged = []
 107
 108         for key, data in chunk_dict.iteritems():
 109             try:
 110                 xpath = self.path_to_xpath(key)
 111                 node = self.edoc.xpath(xpath)[0]
 112                 repl = etree.fromstring(data)
 113                 node.getparent().replace(node, repl);
 114             except Exception, e:
 115                 unmerged.append( repr( (key, xpath, e) ) )
 116
 117         return unmerged