# -*- coding: utf-8 -*-
+#
+# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
import os
import cStringIO
import re
import copy
-import pkgutil
from lxml import etree
+from librarian.parser import WLDocument
+from librarian import XHTMLNS, ParseError
+from lxml.etree import XMLSyntaxError, XSLTApplyError
+# (source sequence, replacement) pairs of unicode strings: '---' becomes an
+# em dash and a straight double quote becomes a right double quotation mark.
+# NOTE(review): presumably iterated by substitute_entities; the loop that
+# reads this list is not visible in this hunk -- confirm.
ENTITY_SUBSTITUTIONS = [
(u'---', u'—'),
(u'"', u'”'),
]
+# Maps a stylesheet name to the path of its XSLT file, relative to the
+# directory containing this module (see get_stylesheet).
+STYLESHEETS = {
+    'legacy': 'xslt/book2html.xslt',
+    'full': 'xslt/wl2html_full.xslt',
+    'partial': 'xslt/wl2html_partial.xslt'
+}
+
+
+def get_stylesheet(name):
+    """Return the path of the XSLT file registered under `name`.
+
+    The path is built by joining this module's directory with the entry
+    stored in STYLESHEETS. Raises KeyError when `name` is not a key of
+    STYLESHEETS.
+    """
+    module_dir = os.path.dirname(__file__)
+    relative_path = STYLESHEETS[name]
+    return os.path.join(module_dir, relative_path)
def substitute_entities(context, text):
"""XPath extension function converting all entites in passed text."""
text = text.replace(entity, substitutution)
return text
-
# Register substitute_entities function with lxml
+# The function is registered under the name 'substitute_entities' in the
+# http://wolnelektury.pl/functions namespace URI.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
-
-def transform(input_filename, output_filename):
+def transform(input, output_filename=None, is_file=True,
+              parse_dublincore=True, stylesheet='legacy', options={}):
- """Transforms file input_filename in XML to output_filename in XHTML."""
+    """Transform a WL XML document to XHTML with the selected stylesheet.
+
+    Parameters:
+        input -- passed to WLDocument.from_file when `is_file` is true,
+            otherwise passed to WLDocument.from_string.
+        output_filename -- when not None, the result is written there
+            (UTF-8, pretty-printed, no XML declaration).
+        is_file -- selects between from_file and from_string for `input`.
+        parse_dublincore -- forwarded to the WLDocument constructor used.
+        stylesheet -- name looked up through get_stylesheet().
+        options -- keyword arguments forwarded to document.transform().
+            The dict is only unpacked, never mutated, so the shared
+            default is safe here.
+
+    Returns:
+        True when the result was written to `output_filename`; the
+        transformed tree when `output_filename` is None; the string
+        "<empty />" when the result contains no <p> element (plain or in
+        the XHTML namespace).
+
+    Raises:
+        ValueError -- `stylesheet` is not a known stylesheet name.
+        ParseError -- wraps lxml's XMLSyntaxError / XSLTApplyError.
+    """
# Parse XSLT
- style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt')
- style = etree.parse(style_filename)
-
- doc_file = cStringIO.StringIO()
- expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
-
- f = open(input_filename, 'r')
- for line in f:
- line = line.decode('utf-8')
- line = expr.sub(u'<br/>\n', line)
- doc_file.write(line.encode('utf-8'))
- f.close()
-
- doc_file.seek(0);
-
- parser = etree.XMLParser(remove_blank_text=True)
- doc = etree.parse(doc_file, parser)
- result = doc.xslt(style)
- if result.find('//p') is not None:
- add_anchors(result.getroot())
- add_table_of_contents(result.getroot())
- result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
- return True
- else:
- return False
+    # Only the name lookup is guarded for KeyError, so a KeyError raised
+    # later during parsing or transformation is not misreported as a bad
+    # stylesheet name.
+    try:
+        style_filename = get_stylesheet(stylesheet)
+    except KeyError:
+        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
+
+    try:
+        style = etree.parse(style_filename)
+
+        if is_file:
+            document = WLDocument.from_file(input, True,
+                parse_dublincore=parse_dublincore)
+        else:
+            document = WLDocument.from_string(input, True,
+                parse_dublincore=parse_dublincore)
+
+        result = document.transform(style, **options)
+        del document # no longer needed large object :)
+
+        # XPath evaluation of a node-set expression yields a list (possibly
+        # empty), never None, so emptiness is tested by truthiness.
+        if etree.ETXPath('//p|//{%s}p' % str(XHTMLNS))(result):
+            add_anchors(result.getroot())
+            add_table_of_contents(result.getroot())
+
+            if output_filename is not None:
+                result.write(output_filename, xml_declaration=False, pretty_print=True, encoding='utf-8')
+                return True
+            return result
+        else:
+            return "<empty />"
+    except (XMLSyntaxError, XSLTApplyError), e:
+        raise ParseError(e)
class Fragment(object):
def __init__(self, id, themes):