bookfragments output

From 648dff98814d2ddeda2f06a8c0e2fb01547fac2a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20St=C4=99pniowski?= Date: Wed, 3 Sep 2008 18:24:30 +0200 Subject: [PATCH] Moved book2html and bookfragments into its own librarian package. --- bin/book2html.py | 83 --------- lib/librarian/__init__.py | 0 lib/librarian/bin/book2html.py | 31 ++++ lib/librarian/bin/bookfragments.py | 50 ++++++ {bin => lib/librarian/bin}/master.css | 0 lib/librarian/bin/master.plain.css | 160 ++++++++++++++++++ {bin => lib/librarian}/book2html.xslt | 0 bin/bookfragments.py => lib/librarian/html.py | 129 +++++++------- 8 files changed, 308 insertions(+), 145 deletions(-) delete mode 100755 bin/book2html.py create mode 100644 lib/librarian/__init__.py create mode 100755 lib/librarian/bin/book2html.py create mode 100755 lib/librarian/bin/bookfragments.py rename {bin => lib/librarian/bin}/master.css (100%) create mode 100644 lib/librarian/bin/master.plain.css rename {bin => lib/librarian}/book2html.xslt (100%) rename bin/bookfragments.py => lib/librarian/html.py (65%) mode change 100755 => 100644 diff --git a/bin/book2html.py b/bin/book2html.py deleted file mode 100755 index 3907de38c..000000000 --- a/bin/book2html.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import cStringIO -import re -import optparse -import os -import sys - -from lxml import etree - - -ENTITY_SUBSTITUTIONS = [ - (u'---', u'â'), - (u'--', u'â'), - (u'...', u'â¦'), - (u',,', u'â'), - (u'"', u'â'), -] - - -def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text - - -# Register substitute_entities function with lxml -ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') -ns['substitute_entities'] = substitute_entities - - -def transform(input_filename, output_filename): - """Transforms file input_filename in XML to output_filename in XHTML.""" - # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') - style = etree.parse(style_filename) - - doc_file = cStringIO.StringIO() - expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - - f = open(input_filename, 'r') - for line in f: - line = line.decode('utf-8') - line = expr.sub(u'
\n', line) - doc_file.write(line.encode('utf-8')) - f.close() - - doc_file.seek(0); - - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) - - result = doc.xslt(style) - result.write(output_filename, xml_declaration=True, pretty_print=True, encoding='utf-8') - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Convert SOURCE files to HTML format.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.html' - transform(input_filename, output_filename) - diff --git a/lib/librarian/__init__.py b/lib/librarian/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lib/librarian/bin/book2html.py b/lib/librarian/bin/book2html.py new file mode 100755 index 000000000..a0229bbe7 --- /dev/null +++ b/lib/librarian/bin/book2html.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Convert SOURCE files to HTML format.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.html' + html.transform(input_filename, output_filename) + diff --git a/lib/librarian/bin/bookfragments.py b/lib/librarian/bin/bookfragments.py new file mode 100755 index 000000000..f29e11e02 --- /dev/null +++ b/lib/librarian/bin/bookfragments.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Extract theme fragments from SOURCE.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' + + closed_fragments, open_fragments = html.extract_fragments(input_filename) + + for fragment_id in open_fragments: + print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) + + output_file = open(output_filename, 'w') + output_file.write(""" + + + bookfragments output + + + + """) + for fragment in closed_fragments.values(): + fragment_html = u'

[#%s] %s

' % (fragment.id, fragment.themes, fragment) + output_file.write(fragment_html.encode('utf-8')) + output_file.write('') + output_file.close() + diff --git a/bin/master.css b/lib/librarian/bin/master.css similarity index 100% rename from bin/master.css rename to lib/librarian/bin/master.css diff --git a/lib/librarian/bin/master.plain.css b/lib/librarian/bin/master.plain.css new file mode 100644 index 000000000..3210e8819 --- /dev/null +++ b/lib/librarian/bin/master.plain.css @@ -0,0 +1,160 @@ +body { + font-size: 16px; + font: Georgia, "Times New Roman", serif; + line-height: 1.5em; + margin: 3em; + max-width: 36em; +} + +a { + color: blue; + text-decoration: none; +} + +/* =================================================== */ +/* = Common elements: headings, paragraphs and lines = */ +/* =================================================== */ +h1 { + font-size: 3em; + margin: 1.5em 0; + text-align: center; + line-height: 1.5em; + font-weight: bold; +} + +h2 { + font-size: 2em; + margin: 1.5em 0 0; + font-weight: bold; + line-height: 1.5em; +} + +h3 { + font-size: 1.5em; + margin: 1.5em 0 0; + font-weight: normal; + line-height: 1.5em; +} + +h4 { + font-size: 1em; + margin: 1.5em 0 0; + line-height: 1.5em; +} + +p { + margin: 0; +} + +/* ======================== */ +/* = Footnotes and themes = */ +/* ======================== */ +.theme-begin { + border-left: 0.1em solid #DDDDDD; + color: #666; + float: right; + margin: 0 -9.5em 0 0; + padding: 0 0.5em; + width: 7.5em; + font-style: normal; + font-weight: normal; + font-size: 16px; + display: none; +} + +.annotation { + font-style: normal; + font-weight: normal; + font-size: 16px; + display: none; +} + +#footnotes { + display: none; +} + +#footnotes .annotation { + display: block; + float: left; + width: 2.5em; + clear: both; +} + +#footnotes div { + margin: 1.5em 0 0 0; +} + +#footnotes p { + margin-left: 2.5em; +} + +/* =================== */ +/* = Custom elements = */ +/* =================== */ +span.author { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: 0.25em; +} + +span.collection { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: -0.25em; +} + +span.subtitle { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-top: -0.25em; +} + +div.didaskalia { + font-style: italic; + margin: 0.5em 0 0; +} + +div.kwestia { + margin: 0.5em 0 0; +} + +div.stanza { + margin: 1.5em 0 0; +} + +div.kwestia div.stanza { + margin: 0; +} + +p.paragraph { + text-align: justify; + margin: 1.5em 0 0; +} + +p.motto { + text-align: justify; + font-style: italic; + margin: 1.5em 0 0; +} + +p.motto_podpis { + font-size: 0.875em; +} + +div.fragment { + border-bottom: 0.1em solid #999; + padding-bottom: 1.5em; +} + +div.note p, div.note p.paragraph { + text-align: right; + font-style: italic; +} + +hr.spacer { + height: 3em; + visibility: hidden; +} diff --git a/bin/book2html.xslt b/lib/librarian/book2html.xslt similarity index 100% rename from bin/book2html.xslt rename to lib/librarian/book2html.xslt diff --git a/bin/bookfragments.py b/lib/librarian/html.py old mode 100755 new mode 100644 similarity index 65% rename from bin/bookfragments.py rename to lib/librarian/html.py index ea2b0b7b7..ae5efa644 --- a/bin/bookfragments.py +++ b/lib/librarian/html.py @@ -1,22 +1,71 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- -import optparse import os +import cStringIO +import re import copy +import pkgutil from lxml import etree +ENTITY_SUBSTITUTIONS = [ + (u'---', u'â'), + (u'--', u'â'), + (u'...', u'â¦'), + (u',,', u'â'), + (u'"', u'â'), +] + + +def substitute_entities(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = ''.join(text) + for entity, substitutution in ENTITY_SUBSTITUTIONS: + text = text.replace(entity, substitutution) + return text + + +# Register substitute_entities function with lxml +ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') +ns['substitute_entities'] = substitute_entities + + +def transform(input_filename, output_filename): + """Transforms file input_filename in XML to output_filename in XHTML.""" + # Parse XSLT + style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') + style = etree.parse(style_filename) + + doc_file = cStringIO.StringIO() + expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); + + f = open(input_filename, 'r') + for line in f: + line = line.decode('utf-8') + line = expr.sub(u'
\n', line) + doc_file.write(line.encode('utf-8')) + f.close() + + doc_file.seek(0); + + parser = etree.XMLParser(remove_blank_text=True) + doc = etree.parse(doc_file, parser) + + result = doc.xslt(style) + result.write(output_filename, xml_declaration=True, pretty_print=True, encoding='utf-8') + + class Fragment(object): def __init__(self, id, themes): super(Fragment, self).__init__() self.id = id self.themes = themes self.events = [] - + def append(self, event, element): self.events.append((event, element)) - + def closed_events(self): stack = [] for event, element in self.events: @@ -27,10 +76,10 @@ class Fragment(object): stack.pop() except IndexError: print 'CLOSED NON-OPEN TAG:', element - + stack.reverse() return self.events + stack - + def to_string(self): result = [] for event, element in self.closed_events(): @@ -44,9 +93,9 @@ class Fragment(object): result.append(element.tail) else: result.append(element) - + return ''.join(result) - + def __unicode__(self): return self.to_string() @@ -55,28 +104,28 @@ def extract_fragments(input_filename): """Extracts theme fragments from input_filename.""" open_fragments = {} closed_fragments = {} - + for event, element in etree.iterparse(input_filename, events=('start', 'end')): # Process begin and end elements if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'): if not event == 'end': continue # Process elements only once, on end event - + # Open new fragment if element.get('class', '') == 'theme-begin': fragment = Fragment(id=element.get('fid'), themes=element.text) - + # Append parents if element.getparent().tag != 'body': parents = [element.getparent()] while parents[-1].getparent().tag != 'body': parents.append(parents[-1].getparent()) - + parents.reverse() for parent in parents: fragment.append('start', parent) - + open_fragments[fragment.id] = fragment - + # Close existing fragment else: try: @@ -86,13 +135,13 @@ def extract_fragments(input_filename): else: closed_fragments[fragment.id] = fragment del open_fragments[fragment.id] - + # Append element tail to lost_text (we don't want to lose any text) if element.tail: for fragment_id in open_fragments: open_fragments[fragment_id].append('text', element.tail) - - + + # Process all elements except begin and end else: # Omit annotation tags @@ -103,50 +152,6 @@ def extract_fragments(input_filename): else: for fragment_id in open_fragments: open_fragments[fragment_id].append(event, copy.copy(element)) - - return closed_fragments, open_fragments - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Extract theme fragments from SOURCE.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' - - closed_fragments, open_fragments = extract_fragments(input_filename) - - for fragment_id in open_fragments: - print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) - - output_file = open(output_filename, 'w') - output_file.write(""" - - - bookfragments output - - - - """) - for fragment in closed_fragments.values(): - html = u'

[#%s] %s

' % (fragment.id, fragment.themes, fragment) - output_file.write(html.encode('utf-8')) - output_file.write('') - output_file.close() + return closed_fragments, open_fragments -- 2.20.1