bin/book2html.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 import cStringIO
   4 import re
   5 import optparse
   6 import os
   7 import sys
   8
   9 from lxml import etree
  10
  11
  12 ENTITY_SUBSTITUTIONS = [
  13     (u'---', u'—'),
  14     (u'--', u'–'),
  15     (u'...', u'…'),
  16     (u',,', u'„'),
  17     (u'"', u'”'),
  18 ]
  19
  20
  21 def substitute_entities(context, text):
  22     """XPath extension function converting all entites in passed text."""
  23     if isinstance(text, list):
  24         text = ''.join(text)
  25     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  26         text = text.replace(entity, substitutution)
  27     return text
  28
  29
# Register substitute_entities with lxml as an extension function, so that
# it can be called under the name 'substitute_entities' in the namespace
# identified by the URI below
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
  33
  34
  35 def transform(input_filename, output_filename):
  36     """Transforms file input_filename in XML to output_filename in XHTML."""
  37     # Parse XSLT
  38     style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt')
  39     style = etree.parse(style_filename)
  40
  41     doc_file = cStringIO.StringIO()
  42     expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
  43
  44     f = open(input_filename, 'r')
  45     for line in f:
  46         line = line.decode('utf-8')
  47         line = expr.sub(u'<br/>\n', line)
  48         doc_file.write(line.encode('utf-8'))
  49     f.close()
  50
  51     doc_file.seek(0);
  52
  53     parser = etree.XMLParser(remove_blank_text=True)
  54     doc = etree.parse(doc_file, parser)
  55
  56     result = doc.xslt(style)
  57     result.write(output_filename, xml_declaration=True, pretty_print=True, encoding='utf-8')
  58
  59
  60 if __name__ == '__main__':
  61     # Parse commandline arguments
  62     usage = """Usage: %prog [options] SOURCE [SOURCE...]
  63     Convert SOURCE files to HTML format."""
  64
  65     parser = optparse.OptionParser(usage=usage)
  66
  67     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
  68         help='print status messages to stdout')
  69
  70     options, input_filenames = parser.parse_args()
  71
  72     if len(input_filenames) < 1:
  73         parser.print_help()
  74         exit(1)
  75
  76     # Do some real work
  77     for input_filename in input_filenames:
  78         if options.verbose:
  79             print input_filename
  80
  81         output_filename = os.path.splitext(input_filename)[0] + '.html'
  82         transform(input_filename, output_filename)
  83