librarian/document.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from StringIO import StringIO
   7 from lxml import etree
   8 from . import SSTNS
   9 from .core import Section
  10 from .parser import SSTParser
  11
  12
  13 class Document(object):
  14     def __init__(self, edoc, meta_context=None):
  15         self.edoc = edoc
  16         root_elem = edoc.getroot()
  17         # Do I use meta_context?
  18         if meta_context is not None:
  19             root_elem.meta_context = meta_context
  20         self.validate()
  21
  22     def validate(self):
  23         root_elem = self.edoc.getroot()
  24         if not isinstance(root_elem, Section):
  25             if root_elem.tag != SSTNS('section'):
  26                 if root_elem.tag == 'section':
  27                     for element in root_elem.iter():
  28                         if element.tag in ('section', 'header', 'div', 'span', 'aside', 'metadata'):
  29                             element.tag = str(SSTNS(element.tag))
  30
  31                     parser = SSTParser()
  32                     tree = etree.parse(StringIO(etree.tostring(root_elem)), parser)
  33                     tree.xinclude()
  34                     self.edoc = tree
  35                 else:
  36                     raise ValueError("Invalid root element. Found '%s', should be '%s'" % (
  37                         root_elem.tag, SSTNS('section')))
  38             else:
  39                 raise ValueError("Invalid class of root element. Use librarian.parser.SSTParser.")
  40
  41     @classmethod
  42     def from_string(cls, xml, *args, **kwargs):
  43         return cls.from_file(StringIO(xml), *args, **kwargs)
  44
  45     @classmethod
  46     def from_file(cls, xmlfile, *args, **kwargs):
  47         # first, prepare for parsing
  48         if isinstance(xmlfile, basestring):
  49             file = open(xmlfile, 'rb')
  50             try:
  51                 data = file.read()
  52             finally:
  53                 file.close()
  54         else:
  55             data = xmlfile.read()
  56
  57         if not isinstance(data, unicode):
  58             data = data.decode('utf-8')
  59
  60         data = data.replace(u'\ufeff', '')
  61         # This is bad. The editor shouldn't spew unknown HTML entities.
  62         data = data.replace(u'&nbsp;', u'\u00a0')
  63
  64         parser = SSTParser()
  65         tree = etree.parse(StringIO(data.encode('utf-8')), parser)
  66         tree.xinclude()
  67         return cls(tree, *args, **kwargs)
  68
  69     @property
  70     def meta(self):
  71         """ Document's metadata is root's metadata. """
  72         return self.edoc.getroot().meta