#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# bin/book2html.py (new file, mode 100755, index 000000000..5bd2bb527)
#
# From: Marek Stępniowski
# Date: Thu, 28 Aug 2008 12:28:07 +0000 (+0200)
# Subject: Added book2html and bookfragments utilities to repository.
# X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/239d66922f4b83ee5baaa284a9c33a32bfcb99a4?ds=inline
"""Convert book XML sources to HTML using the ``book2html.xslt`` stylesheet.

For every SOURCE file given on the command line the script:

1. reads the file as UTF-8 and normalises each line (see ``normalize_line``),
2. parses the normalised text as XML with blank text removed,
3. applies the XSLT stylesheet and writes the result next to the source,
   with the extension replaced by ``.html``.

The stylesheet path is relative to the current working directory, exactly
as in the original script.
"""
import io
import optparse
import os
import re
import sys


USAGE = """Usage: %prog [options] SOURCE [SOURCE...]
Convert SOURCE files to HTML format."""

# Stylesheet file name, resolved against the current working directory.
STYLESHEET = 'book2html.xslt'

# A slash followed by whitespace marks a line break in the source text.
# Compiled once at import time instead of once per input file.
LINE_BREAK_RE = re.compile(r'/\s', re.MULTILINE | re.UNICODE)

# NOTE(review): the tag inside this literal was stripped when the patch was
# extracted (only the trailing "\n" survived); "<br/>" is reconstructed from
# the way the lost markup rendered as a line break -- confirm against the
# repository.
LINE_BREAK_HTML = u'<br/>\n'


def normalize_line(line):
    """Return *line* with the source shorthand replaced.

    * ``/`` followed by one whitespace character becomes an HTML line break
      plus a newline (the whitespace character itself is consumed),
    * ``---`` becomes an em dash,
    * ``,,`` becomes a low double quotation mark.

    *line* must be a unicode string; a unicode string is returned.
    """
    line = LINE_BREAK_RE.sub(LINE_BREAK_HTML, line)
    return line.replace(u'---', u'—').replace(u',,', u'„')


def main(argv=None):
    """Run the converter; return the process exit status.

    *argv* is the argument list without the program name; ``None`` means
    ``sys.argv[1:]``.  Returns 1 (after printing help) when no SOURCE was
    given, 0 otherwise.  Parse and I/O errors propagate as exceptions.
    """
    # Imported here so that normalize_line() stays importable on machines
    # without lxml installed.
    from lxml import etree

    opt_parser = optparse.OptionParser(usage=USAGE)
    opt_parser.add_option(
        '-v', '--verbose', action='store_true', dest='verbose', default=False,
        help='print status messages to stdout')
    options, input_filenames = opt_parser.parse_args(argv)

    if not input_filenames:
        opt_parser.print_help()
        return 1

    style = etree.parse(STYLESHEET)
    # Named xml_parser so it no longer shadows the option parser.
    xml_parser = etree.XMLParser(remove_blank_text=True)

    for input_filename in input_filenames:
        if options.verbose:
            print(input_filename)

        output_filename = os.path.splitext(input_filename)[0] + '.html'

        # Decode once on input, encode once for the XML parser; the with
        # block closes the file even when normalisation fails.
        with io.open(input_filename, 'r', encoding='utf-8') as source:
            text = u''.join(normalize_line(line) for line in source)

        doc = etree.parse(io.BytesIO(text.encode('utf-8')), xml_parser)
        result = doc.xslt(style)
        result.write(output_filename, xml_declaration=True,
                     pretty_print=True, encoding='utf-8')

    return 0


if __name__ == '__main__':
    sys.exit(main())

# ---------------------------------------------------------------------------
# The same commit adds bin/book2html.xslt (new file, mode 100644,
# index 000000000..6cba758c0, hunk @@ -0,0 +1,150 @@).  Its markup was lost
# when this patch was extracted; only stray text nodes of the stylesheet
# (such as "book2html output") remain in the lines that follow.
+

Przypisy

+ +
+ + [] + + +

+
+ + + +
+
+
+
+ + +
+ + +

+
+ + +

+
+ + +

+
+ + +
+ +
+
+ + +
+
+ + +
+

+
    + +
+
+
+ + +
  • +
    + + + + + + + + + + + + + + + + + + + +

    + + + indent: 1em + + + indent: 2em + + + +

    +
    + + + + [] + + + +
    + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + +

    +
    + + + + +
    +
    + +
    # End of the bin/book2html.xslt hunk: "\ No newline at end of file".
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# bin/bookfragments.py (new file, mode 100755, index 000000000..73d271116)
"""Extract theme fragments from book files into ``*.fragments.html``.

A fragment starts at ``<span class="theme-begin" fid="N">themes</span>`` and
ends at the ``<span class="theme-end" fid="N">`` carrying the same ``fid``.
Every element and text run seen between the two markers is recorded and
later serialised, with tags still open at the end marker closed
automatically.  Fragments may overlap: several can be open at once.
"""
import copy
import io
import optparse
import os
import sys
from xml.sax.saxutils import escape, quoteattr


USAGE = """Usage: %prog [options] SOURCE [SOURCE...]
Extract theme fragments from SOURCE."""

THEME_MARKERS = ('theme-begin', 'theme-end')

# NOTE(review): all markup in the three literals below was stripped when the
# patch was extracted (only "bookfragments output", "[#%s] %s" and "%s"
# survived).  This is a minimal reconstruction -- confirm against the
# repository before relying on the exact tags.
HTML_HEADER = u"""<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>bookfragments output</title>
</head>
<body>
"""
FRAGMENT_TEMPLATE = u'<div class="fragment"><h2>[#%s] %s</h2>%s</div>\n'
HTML_FOOTER = u'</body>\n</html>\n'


class Fragment(object):
    """Events recorded between one theme-begin marker and its theme-end.

    ``events`` is a list of ``(kind, payload)`` pairs: for ``'start'`` and
    ``'end'`` the payload is an element (anything with ``tag``, ``attrib``,
    ``text`` and ``tail``); for any other kind (the script uses ``'text'``)
    the payload is a plain string.
    """

    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        """Record one ``(event, element)`` pair."""
        self.events.append((event, element))

    def closed_events(self):
        """Return the events followed by an 'end' for each still-open tag.

        An 'end' that has no matching 'start' inside the fragment is
        reported on stdout and otherwise left in place.
        """
        pending = []
        for event, element in self.events:
            if event == 'start':
                pending.append(('end', element))
            elif event == 'end':
                if pending:
                    pending.pop()
                else:
                    print('CLOSED NON-OPEN TAG: %s' % (element,))

        # Innermost open tag must be closed first.
        pending.reverse()
        return self.events + pending

    def to_string(self):
        """Serialise the fragment to a unicode string of markup.

        Text, tails and attribute values are escaped: the parser hands them
        over decoded, so writing them back raw would corrupt the output as
        soon as they contain ``&`` or ``<``.
        """
        parts = []
        for event, element in self.closed_events():
            if event == 'start':
                attrs = u''.join(
                    u' %s=%s' % (key, quoteattr(value))
                    for key, value in element.attrib.items())
                parts.append(u'<%s%s>' % (element.tag, attrs))
                if element.text:
                    parts.append(escape(element.text))
            elif event == 'end':
                # The "</%s>" format had been stripped down to u'', which
                # raised TypeError on the first closing tag.
                parts.append(u'</%s>' % element.tag)
                if element.tail:
                    parts.append(escape(element.tail))
            else:
                parts.append(escape(element))

        return u''.join(parts)

    def __unicode__(self):
        return self.to_string()


def ancestors_below_body(element):
    """Return *element*'s ancestors up to, but excluding, ``body``.

    The list is ordered outermost first.  The walk also stops at the
    document root, so input without a ``body`` element no longer fails with
    AttributeError on ``None``.
    """
    chain = []
    node = element.getparent()
    while node is not None and node.tag != 'body':
        chain.append(node)
        node = node.getparent()
    chain.reverse()
    return chain


def extract_fragments(input_filename):
    """Parse *input_filename*; return ``(open_fragments, closed_fragments)``.

    Both are dicts keyed by fragment id.  ``open_fragments`` holds the
    fragments whose end marker was never seen.
    """
    # Imported here so Fragment stays importable without lxml installed.
    from lxml import etree

    open_fragments = {}
    closed_fragments = {}

    # NOTE(review): elements are copied on 'start' as well as on 'end'; the
    # iterparse documentation should be checked for whether ``text`` is
    # already complete at 'start' time -- behaviour kept as in the original.
    for event, element in etree.iterparse(input_filename,
                                          events=('start', 'end')):
        css_class = element.get('class', '')

        if element.tag == 'span' and css_class in THEME_MARKERS:
            # Handle each marker once, when it has been read completely.
            if event != 'end':
                continue

            fid = element.get('fid')
            if css_class == 'theme-begin':
                fragment = Fragment(id=fid, themes=element.text or u'')
                # Re-open the enclosing tags so the fragment is well formed.
                for parent in ancestors_below_body(element):
                    fragment.append('start', parent)
                open_fragments[fragment.id] = fragment
            else:
                fragment = open_fragments.pop(fid, None)
                if fragment is None:
                    print('%s:closed not open fragment #%s'
                          % (input_filename, fid))
                else:
                    closed_fragments[fragment.id] = fragment

            # Text right after a marker belongs to every open fragment.
            if element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)

        elif element.get('name', '') or css_class == 'annotation':
            # Annotation tags are dropped, but the text after them is kept.
            if event == 'end' and element.tail:
                for fragment in open_fragments.values():
                    fragment.append('text', element.tail)

        else:
            for fragment in open_fragments.values():
                fragment.append(event, copy.copy(element))

    return open_fragments, closed_fragments


def render_fragment(fragment):
    """Return the HTML block written to the output file for *fragment*."""
    return FRAGMENT_TEMPLATE % (
        escape(u'%s' % (fragment.id,)),
        escape(u'%s' % (fragment.themes,)),
        fragment.to_string())


def main(argv=None):
    """Run the extractor; return the process exit status.

    *argv* is the argument list without the program name; ``None`` means
    ``sys.argv[1:]``.  Returns 1 (after printing help) when no SOURCE was
    given, 0 otherwise.
    """
    opt_parser = optparse.OptionParser(usage=USAGE)
    opt_parser.add_option(
        '-v', '--verbose', action='store_true', dest='verbose', default=False,
        help='print status messages to stdout')
    options, input_filenames = opt_parser.parse_args(argv)

    if not input_filenames:
        opt_parser.print_help()
        return 1

    for input_filename in input_filenames:
        if options.verbose:
            print(input_filename)

        output_filename = (os.path.splitext(input_filename)[0]
                           + '.fragments.html')

        open_fragments, closed_fragments = extract_fragments(input_filename)

        for fragment_id in open_fragments:
            print('%s:warning:unclosed fragment #%s'
                  % (input_filename, fragment_id))

        # Text-mode file with an explicit encoding: encode once, on write.
        with io.open(output_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(HTML_HEADER)
            for fragment in closed_fragments.values():
                output_file.write(render_fragment(fragment))
            output_file.write(HTML_FOOTER)

    return 0


if __name__ == '__main__':
    sys.exit(main())