From 239d66922f4b83ee5baaa284a9c33a32bfcb99a4 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20St=C4=99pniowski?= <marek@stepniowski.com>
Date: Thu, 28 Aug 2008 14:28:07 +0200
Subject: [PATCH] Added book2html and bookfragments utilities to repository.

---
 bin/book2html.py     |  72 ++++++++++++++++
 bin/book2html.xslt   | 150 +++++++++++++++++++++++++++++++++
 bin/bookfragments.py | 179 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 401 insertions(+)
 create mode 100755 bin/book2html.py
 create mode 100644 bin/book2html.xslt
 create mode 100755 bin/bookfragments.py

diff --git a/bin/book2html.py b/bin/book2html.py
new file mode 100755
index 000000000..5bd2bb527
--- /dev/null
+++ b/bin/book2html.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Convert book XML sources to HTML with the book2html.xslt stylesheet.
+
+Each SOURCE file is preprocessed line by line, parsed as XML, transformed
+and written next to the source with an ``.html`` extension.
+"""
+import cStringIO
+import re
+import optparse
+import os
+import sys
+
+from lxml import etree
+
+
+# A slash followed by whitespace is rewritten to a <br/> element.
+LINE_BREAK_RE = re.compile(r'/\s', re.MULTILINE | re.UNICODE)
+
+# Resolve the stylesheet relative to this script rather than the current
+# working directory, so the tool can be run from anywhere.
+STYLESHEET_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), 'book2html.xslt')
+
+
+# Parse args
+usage = """Usage: %prog [options] SOURCE [SOURCE...]
+Convert SOURCE files to HTML format."""
+
+parser = optparse.OptionParser(usage=usage)
+
+parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
+    help='print status messages to stdout')
+
+options, input_filenames = parser.parse_args()
+
+if len(input_filenames) < 1:
+    parser.print_help()
+    sys.exit(1)
+
+# Parse and compile the XSLT once; it is reused for every input file
+transform = etree.XSLT(etree.parse(STYLESHEET_PATH))
+
+# Do some real work
+for input_filename in input_filenames:
+    if options.verbose:
+        print input_filename
+
+    output_filename = os.path.splitext(input_filename)[0] + '.html'
+
+    # Preprocess the source into an in-memory UTF-8 buffer
+    doc_file = cStringIO.StringIO()
+
+    f = open(input_filename, 'r')
+    try:
+        for line in f:
+            line = line.decode('utf-8')
+            line = LINE_BREAK_RE.sub(u'<br/>\n', line)
+            # '---' -> em dash (U+2014), ',,' -> low opening quote (U+201E)
+            line = line.replace(u'---', u'\u2014').replace(u',,', u'\u201e')
+            doc_file.write(line.encode('utf-8'))
+    finally:
+        f.close()
+
+    doc_file.seek(0)
+
+    # Transform
+    xml_parser = etree.XMLParser(remove_blank_text=True)
+    doc = etree.parse(doc_file, xml_parser)
+
+    result = transform(doc)
+    result.write(output_filename, xml_declaration=True, pretty_print=True, encoding='utf-8')
diff --git a/bin/book2html.xslt b/bin/book2html.xslt
new file mode 100644
index 000000000..6cba758c0
--- /dev/null
+++ b/bin/book2html.xslt
@@ -0,0 +1,150 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+
+<xsl:output method="xml" encoding="utf-8" doctype-public="-//W3C//DTD XHTML 1.1//EN" doctype-system="http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" indent="yes" />
+
+<xsl:template match="text()" />
+
+<xsl:template match="extra|uwaga" />
+<xsl:template match="extra|uwaga" mode="inline" />
+
+<xsl:template match="/">
+    <html>
+    <head>
+        <title>book2html output</title>
+        <meta http-equiv="content-type" content="text/html;charset=utf-8"/>
+        <link rel="stylesheet" href="master.css" type="text/css" media="screen" charset="utf-8" />
+    </head>
+    <body>
+        <xsl:apply-templates />
+        <div id="footnotes">
+            <h3>Przypisy</h3>
+            <xsl:for-each select="descendant::*[self::pe or self::pa or self::pr or self::pt][not(parent::extra)]">
+                <div>
+                    <a name="{concat('footnote-', generate-id(.))}" />
+                    <a href="{concat('#anchor-', generate-id(.))}" class="annotation">[<xsl:number value="count(preceding::*[self::pa or self::pe or self::pr or self::pt]) + 1" />]</a>
+                    <xsl:choose>
+                        <xsl:when test="count(akap|akap_cd|strofa) = 0">
+                            <p><xsl:apply-templates select="text()|*" mode="inline" /></p>
+                        </xsl:when>
+                        <xsl:otherwise>
+                            <xsl:apply-templates select="text()|*" mode="inline" />
+                        </xsl:otherwise>
+                    </xsl:choose>
+                </div>
+            </xsl:for-each>
+        </div>
+    </body>
+    </html>
+</xsl:template>
+
+<xsl:template match="naglowek_akt|naglowek_czesc">
+    <h2><xsl:apply-templates mode="inline" /></h2>
+</xsl:template>
+
+<xsl:template match="naglowek_scena|naglowek_rozdzial">
+    <h3><xsl:apply-templates mode="inline" /></h3>
+</xsl:template>
+
+<xsl:template match="naglowek_osoba">
+    <h4><xsl:apply-templates mode="inline" /></h4>
+</xsl:template>
+
+<xsl:template match="kwestia">
+    <div class="kwestia">
+        <xsl:apply-templates select="strofa|akap" />
+    </div>
+</xsl:template>
+
+<xsl:template match="didaskalia">
+    <div class="didaskalia"><xsl:apply-templates mode="inline" /></div>
+</xsl:template>
+
+<xsl:template match="lista_osob">
+    <div class="person-list">
+        <h3><xsl:value-of select="naglowek_listy" /></h3>
+        <ol>
+            <xsl:apply-templates select="lista_osoba" />
+        </ol>
+    </div>
+</xsl:template>
+
+<xsl:template match="lista_osoba">
+    <li><xsl:apply-templates mode="inline" /></li>
+</xsl:template>
+
+<xsl:template match="begin" mode="inline">
+    <xsl:variable name="mnum" select="concat('m', substring(@id, 2))" />
+    <span class="theme-begin" fid="{substring(@id, 2)}">
+        <xsl:value-of select="string(following::motyw[@id=$mnum]/text())" />
+    </span>
+</xsl:template>
+
+<xsl:template match="end" mode="inline">
+    <span class="theme-end" fid="{substring(@id, 2)}"> </span>
+</xsl:template>
+
+<xsl:template match="begin|end">
+    <xsl:apply-templates select='.' mode="inline" />
+</xsl:template>
+
+<xsl:template name="verse">
+    <xsl:param name="line-content" />
+    <xsl:param name="line-number" />
+    <p>
+        <xsl:choose>
+            <xsl:when test="name($line-content) = 'wers_akap'">
+                <xsl:attribute name="style">text-indent: 1em</xsl:attribute>
+            </xsl:when>
+            <xsl:when test="name($line-content) = 'wers_wciety'">
+                <xsl:attribute name="style">text-indent: 2em</xsl:attribute>
+            </xsl:when>
+        </xsl:choose>
+        <xsl:apply-templates select="$line-content" mode="inline" />
+    </p>
+</xsl:template>
+
+<xsl:template match="pa|pe|pr|pt" mode="inline">
+    <a name="{concat('anchor-', generate-id(.))}" />
+    <a href="{concat('#footnote-', generate-id(.))}" class="annotation">[<xsl:number value="count(preceding::*[self::pa or self::pe or self::pr or self::pt]) + 1" />]</a>
+</xsl:template>
+
+<xsl:template match="strofa">
+    <div class="stanza">
+        <xsl:choose>
+            <xsl:when test="count(br) > 0">
+                <xsl:call-template name="verse">
+                    <xsl:with-param name="line-content" select="br[1]/preceding-sibling::node()" />
+                    <xsl:with-param name="line-number" select="1" />
+                </xsl:call-template>
+                <xsl:for-each select="br">
+                    <!-- Each BR "eats" whatever follows it -->
+                    <xsl:variable name="lnum" select="count(preceding-sibling::br)" />
+                    <xsl:call-template name="verse">
+                        <xsl:with-param name="line-number" select="$lnum+2" />
+                        <xsl:with-param name="line-content"
+                            select="following-sibling::node()[count(preceding-sibling::br) = $lnum+1]" />
+                    </xsl:call-template>
+                </xsl:for-each>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:call-template name="verse">
+                    <xsl:with-param name="line-content" select="node()" />
+                    <xsl:with-param name="line-number" select="1" />
+                </xsl:call-template>
+            </xsl:otherwise>
+        </xsl:choose>
+    </div>
+</xsl:template>
+
+<xsl:template match="akap|akap_dialog|akap_cd">
+    <p class="paragraph"><xsl:apply-templates mode="inline" /></p>
+</xsl:template>
+
+<xsl:template match="motyw" mode="inline" />
+
+<xsl:template match="dlugi_cytat">
+    <blockquote><xsl:apply-templates /></blockquote>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/bin/bookfragments.py b/bin/bookfragments.py
new file mode 100755
index 000000000..73d271116
--- /dev/null
+++ b/bin/bookfragments.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Extract theme fragments from HTML files produced by book2html.
+
+A fragment is the markup found between a ``theme-begin`` span and the
+``theme-end`` span with the same ``fid`` attribute.  The fragments of each
+SOURCE are written next to it with a ``.fragments.html`` extension.
+"""
+import optparse
+import os
+import sys
+from xml.sax.saxutils import escape, quoteattr
+
+from lxml import etree
+
+
+# Parse args
+usage = """Usage: %prog [options] SOURCE [SOURCE...]
+Extract theme fragments from SOURCE."""
+
+parser = optparse.OptionParser(usage=usage)
+
+parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
+    help='print status messages to stdout')
+
+options, input_filenames = parser.parse_args()
+
+if len(input_filenames) < 1:
+    parser.print_help()
+    sys.exit(1)
+
+
+class Fragment(object):
+    """Markup events collected between a theme-begin and a theme-end marker."""
+
+    def __init__(self, id, themes):
+        super(Fragment, self).__init__()
+        self.id = id
+        self.themes = themes
+        self.events = []
+
+    def append(self, event, element):
+        """Record ('start'|'end', element) or ('text', string)."""
+        self.events.append((event, element))
+
+    def closed_events(self):
+        """Return the events plus an 'end' event for every element left open.
+
+        The extra 'end' events are ordered innermost element first so that
+        the serialized markup is properly nested.
+        """
+        stack = []
+        for event, element in self.events:
+            if event == 'start':
+                stack.append(('end', element))
+            elif event == 'end':
+                try:
+                    stack.pop()
+                except IndexError:
+                    print 'CLOSED NON-OPEN TAG:', element
+
+        stack.reverse()
+        return self.events + stack
+
+    def to_string(self):
+        """Serialize the fragment, escaping text and attribute values.
+
+        The parser hands out decoded text, so characters such as '&' and
+        '<' must be escaped again to keep the output well formed.
+        """
+        # NOTE(review): ancestors opened or closed on the fragment's behalf
+        # also contribute their own text/tail, which lies outside the
+        # begin/end markers -- confirm whether that is intended.
+        result = []
+        for event, element in self.closed_events():
+            if event == 'start':
+                attrs = u''.join(u' %s=%s' % (k, quoteattr(v))
+                                 for k, v in element.attrib.items())
+                result.append(u'<%s%s>' % (element.tag, attrs))
+                if element.text:
+                    result.append(escape(element.text))
+            elif event == 'end':
+                result.append(u'</%s>' % element.tag)
+                if element.tail:
+                    result.append(escape(element.tail))
+            else:
+                # 'text' events carry a plain string
+                result.append(escape(element))
+
+        return u''.join(result)
+
+    def __unicode__(self):
+        return self.to_string()
+
+
+# Do some real work
+for input_filename in input_filenames:
+    if options.verbose:
+        print input_filename
+
+    output_filename = os.path.splitext(input_filename)[0] + '.fragments.html'
+
+    open_fragments = {}
+    closed_fragments = {}
+
+    for event, element in etree.iterparse(input_filename, events=('start', 'end')):
+
+        # Process begin and end elements
+        if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'):
+            if not event == 'end': continue # Process elements only once, on end event
+
+            # Open new fragment
+            if element.get('class', '') == 'theme-begin':
+                fragment = Fragment(id=element.get('fid'), themes=element.text or u'')
+
+                # Open every ancestor below <body>, outermost first; stop at
+                # the root when the marker is not inside a <body> at all
+                parents = []
+                parent = element.getparent()
+                while parent is not None and parent.tag != 'body':
+                    parents.append(parent)
+                    parent = parent.getparent()
+
+                parents.reverse()
+                for parent in parents:
+                    fragment.append('start', parent)
+
+                open_fragments[fragment.id] = fragment
+
+            # Close existing fragment
+            else:
+                try:
+                    fragment = open_fragments[element.get('fid')]
+                except KeyError:
+                    print '%s:closed not open fragment #%s' % (input_filename, element.get('fid'))
+                else:
+                    closed_fragments[fragment.id] = fragment
+                    del open_fragments[fragment.id]
+
+            # Text after the marker belongs to every fragment that is still open
+            if element.tail:
+                for fragment_id in open_fragments:
+                    open_fragments[fragment_id].append('text', element.tail)
+
+
+        # Process all elements except begin and end
+        else:
+            # Omit annotation tags
+            if element.get('name') or element.get('class', '') == 'annotation':
+                if event == 'end' and element.tail:
+                    for fragment_id in open_fragments:
+                        open_fragments[fragment_id].append('text', element.tail)
+            else:
+                # Keep the element itself rather than a copy: its text and
+                # tail may not be parsed yet when the event is delivered,
+                # and the fragment is only serialized after parsing ends
+                for fragment_id in open_fragments:
+                    open_fragments[fragment_id].append(event, element)
+
+
+    for fragment_id in open_fragments:
+        print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id)
+
+    output_file = open(output_filename, 'w')
+    try:
+        output_file.write("""
+    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+    <html><head>
+        <title>bookfragments output</title>
+        <meta http-equiv="content-type" content="text/html;charset=utf-8"/>
+        <link rel="stylesheet" href="master.css" type="text/css" media="screen" charset="utf-8" />
+    </head>
+    <body>""")
+        for fragment in closed_fragments.values():
+            html = u'<div class="fragment"><h3>[#%s] %s</h3>%s</div>' % (fragment.id, escape(fragment.themes), fragment)
+            output_file.write(html.encode('utf-8'))
+        output_file.write('</body></html>')
+    finally:
+        output_file.close()
-- 
2.20.1