X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/07fdba2c7fe8e11b6867712d47bdd608e88c29fb..993a8cd64b2913a6d1c42f094c37c6c2d66d2332:/librarian/html.py diff --git a/librarian/html.py b/librarian/html.py index b279e5d..c2e0dea 100644 --- a/librarian/html.py +++ b/librarian/html.py @@ -1,12 +1,18 @@ # -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# import os import cStringIO import re import copy -import pkgutil from lxml import etree +from librarian.parser import WLDocument +from librarian import XHTMLNS, ParseError +from lxml.etree import XMLSyntaxError, XSLTApplyError ENTITY_SUBSTITUTIONS = [ (u'---', u'—'), @@ -16,6 +22,14 @@ ENTITY_SUBSTITUTIONS = [ (u'"', u'”'), ] +STYLESHEETS = { + 'legacy': 'xslt/book2html.xslt', + 'full': 'xslt/wl2html_full.xslt', + 'partial': 'xslt/wl2html_partial.xslt' +} + +def get_stylesheet(name): + return os.path.join(os.path.dirname(__file__), STYLESHEETS[name]) def substitute_entities(context, text): """XPath extension function converting all entites in passed text.""" @@ -25,42 +39,43 @@ def substitute_entities(context, text): text = text.replace(entity, substitutution) return text - # Register substitute_entities function with lxml ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') ns['substitute_entities'] = substitute_entities - -def transform(input_filename, output_filename): +def transform(input, output_filename=None, is_file=True, \ + parse_dublincore=True, stylesheet='legacy', options={}): """Transforms file input_filename in XML to output_filename in XHTML.""" # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') - style = etree.parse(style_filename) - - doc_file = cStringIO.StringIO() - expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - - f = open(input_filename, 'r') - for line in f: - line = line.decode('utf-8') - line = expr.sub(u'
\n', line) - doc_file.write(line.encode('utf-8')) - f.close() - - doc_file.seek(0); - - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) + try: + style_filename = get_stylesheet(stylesheet) + style = etree.parse(style_filename) - result = doc.xslt(style) - if result.find('//p') is not None: - add_anchors(result.getroot()) - add_table_of_contents(result.getroot()) - result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') - return True - else: - return False + if is_file: + document = WLDocument.from_file(input, True, \ + parse_dublincore=parse_dublincore) + else: + document = WLDocument.from_string(input, True, \ + parse_dublincore=parse_dublincore) + result = document.transform(style, **options) + del document # no longer needed large object :) + + if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result) is not None: + add_anchors(result.getroot()) + add_table_of_contents(result.getroot()) + + if output_filename is not None: + result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8') + else: + return result + return True + else: + return "" + except KeyError: + raise ValueError("'%s' is not a valid stylesheet.") + except (XMLSyntaxError, XSLTApplyError), e: + raise ParseError(e) class Fragment(object): def __init__(self, id, themes):