From: Marek Stępniowski Date: Wed, 3 Sep 2008 16:24:30 +0000 (+0200) Subject: Moved book2html and bookfragments into its own librarian package. X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/648dff98814d2ddeda2f06a8c0e2fb01547fac2a Moved book2html and bookfragments into its own librarian package. --- diff --git a/bin/book2html.py b/bin/book2html.py deleted file mode 100755 index 3907de38c..000000000 --- a/bin/book2html.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import cStringIO -import re -import optparse -import os -import sys - -from lxml import etree - - -ENTITY_SUBSTITUTIONS = [ - (u'---', u'—'), - (u'--', u'–'), - (u'...', u'…'), - (u',,', u'„'), - (u'"', u'”'), -] - - -def substitute_entities(context, text): - """XPath extension function converting all entites in passed text.""" - if isinstance(text, list): - text = ''.join(text) - for entity, substitutution in ENTITY_SUBSTITUTIONS: - text = text.replace(entity, substitutution) - return text - - -# Register substitute_entities function with lxml -ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') -ns['substitute_entities'] = substitute_entities - - -def transform(input_filename, output_filename): - """Transforms file input_filename in XML to output_filename in XHTML.""" - # Parse XSLT - style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') - style = etree.parse(style_filename) - - doc_file = cStringIO.StringIO() - expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); - - f = open(input_filename, 'r') - for line in f: - line = line.decode('utf-8') - line = expr.sub(u'
\n', line) - doc_file.write(line.encode('utf-8')) - f.close() - - doc_file.seek(0); - - parser = etree.XMLParser(remove_blank_text=True) - doc = etree.parse(doc_file, parser) - - result = doc.xslt(style) - result.write(output_filename, xml_declaration=True, pretty_print=True, encoding='utf-8') - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Convert SOURCE files to HTML format.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.html' - transform(input_filename, output_filename) - diff --git a/bin/book2html.xslt b/bin/book2html.xslt deleted file mode 100644 index f52d83089..000000000 --- a/bin/book2html.xslt +++ /dev/null @@ -1,200 +0,0 @@ - - - - - - - - - - - - - - - - book2html output - - - - - - -
-

Przypisy

- -
- - [] - - -

-
- - - -
-
-
-
-
- - -
- - - - -

- -

-
- -
- - - - - - - - - - - - - - - - - - - - - -
-
- - -

-
- - -

-
- - -

-
- - -
- -
-
- - -
-
- - -
-

-
    - -
-
-
- - -
  • -
    - - - - - - - - - - - - - - - - - - - -

    - - - indent: 1em - - - indent: 2em - - - -

    -
    - - - - [] - - - -
    - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - -

    -
    - - - - -
    -
    - - -

    -
    - - -

    -
    - - -
    -
    - -
    diff --git a/bin/bookfragments.py b/bin/bookfragments.py deleted file mode 100755 index ea2b0b7b7..000000000 --- a/bin/bookfragments.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import optparse -import os -import copy - -from lxml import etree - - -class Fragment(object): - def __init__(self, id, themes): - super(Fragment, self).__init__() - self.id = id - self.themes = themes - self.events = [] - - def append(self, event, element): - self.events.append((event, element)) - - def closed_events(self): - stack = [] - for event, element in self.events: - if event == 'start': - stack.append(('end', element)) - elif event == 'end': - try: - stack.pop() - except IndexError: - print 'CLOSED NON-OPEN TAG:', element - - stack.reverse() - return self.events + stack - - def to_string(self): - result = [] - for event, element in self.closed_events(): - if event == 'start': - result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) - if element.text: - result.append(element.text) - elif event == 'end': - result.append(u'' % element.tag) - if element.tail: - result.append(element.tail) - else: - result.append(element) - - return ''.join(result) - - def __unicode__(self): - return self.to_string() - - -def extract_fragments(input_filename): - """Extracts theme fragments from input_filename.""" - open_fragments = {} - closed_fragments = {} - - for event, element in etree.iterparse(input_filename, events=('start', 'end')): - # Process begin and end elements - if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'): - if not event == 'end': continue # Process elements only once, on end event - - # Open new fragment - if element.get('class', '') == 'theme-begin': - fragment = Fragment(id=element.get('fid'), themes=element.text) - - # Append parents - if element.getparent().tag != 'body': - parents = [element.getparent()] - while parents[-1].getparent().tag != 'body': - parents.append(parents[-1].getparent()) - - parents.reverse() - for parent in parents: - fragment.append('start', parent) - - open_fragments[fragment.id] = fragment - - # Close existing fragment - else: - try: - fragment = open_fragments[element.get('fid')] - except KeyError: - print '%s:closed not open fragment #%s' % (input_filename, element.get('fid')) - else: - closed_fragments[fragment.id] = fragment - del open_fragments[fragment.id] - - # Append element tail to lost_text (we don't want to lose any text) - if element.tail: - for fragment_id in open_fragments: - open_fragments[fragment_id].append('text', element.tail) - - - # Process all elements except begin and end - else: - # Omit annotation tags - if len(element.get('name', '')) or element.get('class', '') == 'annotation': - if event == 'end' and element.tail: - for fragment_id in open_fragments: - open_fragments[fragment_id].append('text', element.tail) - else: - for fragment_id in open_fragments: - open_fragments[fragment_id].append(event, copy.copy(element)) - - return closed_fragments, open_fragments - - -if __name__ == '__main__': - # Parse commandline arguments - usage = """Usage: %prog [options] SOURCE [SOURCE...] - Extract theme fragments from SOURCE.""" - - parser = optparse.OptionParser(usage=usage) - - parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - - options, input_filenames = parser.parse_args() - - if len(input_filenames) < 1: - parser.print_help() - exit(1) - - # Do some real work - for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' - - closed_fragments, open_fragments = extract_fragments(input_filename) - - for fragment_id in open_fragments: - print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) - - output_file = open(output_filename, 'w') - output_file.write(""" - - - bookfragments output - - - - """) - for fragment in closed_fragments.values(): - html = u'

    [#%s] %s

    %s
    ' % (fragment.id, fragment.themes, fragment) - output_file.write(html.encode('utf-8')) - output_file.write('') - output_file.close() - diff --git a/bin/master.css b/bin/master.css deleted file mode 100644 index f07e41cef..000000000 --- a/bin/master.css +++ /dev/null @@ -1,154 +0,0 @@ -body { - font-size: 16px; - font: Georgia, "Times New Roman", serif; - line-height: 1.5em; - margin: 3em; - max-width: 36em; -} - -a { - color: blue; - text-decoration: none; -} - -/* =================================================== */ -/* = Common elements: headings, paragraphs and lines = */ -/* =================================================== */ -h1 { - font-size: 3em; - margin: 1.5em 0; - text-align: center; - line-height: 1.5em; - font-weight: bold; -} - -h2 { - font-size: 2em; - margin: 1.5em 0 0; - font-weight: bold; - line-height: 1.5em; -} - -h3 { - font-size: 1.5em; - margin: 1.5em 0 0; - font-weight: normal; - line-height: 1.5em; -} - -h4 { - font-size: 1em; - margin: 1.5em 0 0; - line-height: 1.5em; -} - -p { - margin: 0; -} - -/* ======================== */ -/* = Footnotes and themes = */ -/* ======================== */ -.theme-begin { - border-left: 0.1em solid #DDDDDD; - color: #666; - float: right; - margin: 0 -9.5em 0 0; - padding: 0 0.5em; - width: 7.5em; - font-style: normal; - font-weight: normal; - font-size: 16px; -} - -.annotation { - font-style: normal; - font-weight: normal; - font-size: 16px; -} - -#footnotes .annotation { - display: block; - float: left; - width: 2.5em; - clear: both; -} - -#footnotes div { - margin: 1.5em 0 0 0; -} - -#footnotes p { - margin-left: 2.5em; -} - -/* =================== */ -/* = Custom elements = */ -/* =================== */ -span.author { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-bottom: 0.25em; -} - -span.collection { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-bottom: -0.25em; -} - -span.subtitle { - font-size: 0.75em; - display: block; - line-height: 1.5em; - margin-top: -0.25em; -} - -div.didaskalia { - font-style: italic; - margin: 0.5em 0 0; -} - -div.kwestia { - margin: 0.5em 0 0; -} - -div.stanza { - margin: 1.5em 0 0; -} - -div.kwestia div.stanza { - margin: 0; -} - -p.paragraph { - text-align: justify; - margin: 1.5em 0 0; -} - -p.motto { - text-align: justify; - font-style: italic; - margin: 1.5em 0 0; -} - -p.motto_podpis { - font-size: 0.875em; -} - -div.fragment { - border-bottom: 0.1em solid #999; - padding-bottom: 1.5em; -} - -div.note p, div.note p.paragraph { - text-align: right; - font-style: italic; -} - -hr.spacer { - height: 3em; - visibility: hidden; -} diff --git a/lib/librarian/__init__.py b/lib/librarian/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lib/librarian/bin/book2html.py b/lib/librarian/bin/book2html.py new file mode 100755 index 000000000..a0229bbe7 --- /dev/null +++ b/lib/librarian/bin/book2html.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Convert SOURCE files to HTML format.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.html' + html.transform(input_filename, output_filename) + diff --git a/lib/librarian/bin/bookfragments.py b/lib/librarian/bin/bookfragments.py new file mode 100755 index 000000000..f29e11e02 --- /dev/null +++ b/lib/librarian/bin/bookfragments.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +import os +import optparse + +from librarian import html + + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Extract theme fragments from SOURCE.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' + + closed_fragments, open_fragments = html.extract_fragments(input_filename) + + for fragment_id in open_fragments: + print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) + + output_file = open(output_filename, 'w') + output_file.write(""" + + + bookfragments output + + + + """) + for fragment in closed_fragments.values(): + fragment_html = u'

    [#%s] %s

    %s
    ' % (fragment.id, fragment.themes, fragment) + output_file.write(fragment_html.encode('utf-8')) + output_file.write('') + output_file.close() + diff --git a/lib/librarian/bin/master.css b/lib/librarian/bin/master.css new file mode 100644 index 000000000..f07e41cef --- /dev/null +++ b/lib/librarian/bin/master.css @@ -0,0 +1,154 @@ +body { + font-size: 16px; + font: Georgia, "Times New Roman", serif; + line-height: 1.5em; + margin: 3em; + max-width: 36em; +} + +a { + color: blue; + text-decoration: none; +} + +/* =================================================== */ +/* = Common elements: headings, paragraphs and lines = */ +/* =================================================== */ +h1 { + font-size: 3em; + margin: 1.5em 0; + text-align: center; + line-height: 1.5em; + font-weight: bold; +} + +h2 { + font-size: 2em; + margin: 1.5em 0 0; + font-weight: bold; + line-height: 1.5em; +} + +h3 { + font-size: 1.5em; + margin: 1.5em 0 0; + font-weight: normal; + line-height: 1.5em; +} + +h4 { + font-size: 1em; + margin: 1.5em 0 0; + line-height: 1.5em; +} + +p { + margin: 0; +} + +/* ======================== */ +/* = Footnotes and themes = */ +/* ======================== */ +.theme-begin { + border-left: 0.1em solid #DDDDDD; + color: #666; + float: right; + margin: 0 -9.5em 0 0; + padding: 0 0.5em; + width: 7.5em; + font-style: normal; + font-weight: normal; + font-size: 16px; +} + +.annotation { + font-style: normal; + font-weight: normal; + font-size: 16px; +} + +#footnotes .annotation { + display: block; + float: left; + width: 2.5em; + clear: both; +} + +#footnotes div { + margin: 1.5em 0 0 0; +} + +#footnotes p { + margin-left: 2.5em; +} + +/* =================== */ +/* = Custom elements = */ +/* =================== */ +span.author { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: 0.25em; +} + +span.collection { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: -0.25em; +} + +span.subtitle { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-top: -0.25em; +} + +div.didaskalia { + font-style: italic; + margin: 0.5em 0 0; +} + +div.kwestia { + margin: 0.5em 0 0; +} + +div.stanza { + margin: 1.5em 0 0; +} + +div.kwestia div.stanza { + margin: 0; +} + +p.paragraph { + text-align: justify; + margin: 1.5em 0 0; +} + +p.motto { + text-align: justify; + font-style: italic; + margin: 1.5em 0 0; +} + +p.motto_podpis { + font-size: 0.875em; +} + +div.fragment { + border-bottom: 0.1em solid #999; + padding-bottom: 1.5em; +} + +div.note p, div.note p.paragraph { + text-align: right; + font-style: italic; +} + +hr.spacer { + height: 3em; + visibility: hidden; +} diff --git a/lib/librarian/bin/master.plain.css b/lib/librarian/bin/master.plain.css new file mode 100644 index 000000000..3210e8819 --- /dev/null +++ b/lib/librarian/bin/master.plain.css @@ -0,0 +1,160 @@ +body { + font-size: 16px; + font: Georgia, "Times New Roman", serif; + line-height: 1.5em; + margin: 3em; + max-width: 36em; +} + +a { + color: blue; + text-decoration: none; +} + +/* =================================================== */ +/* = Common elements: headings, paragraphs and lines = */ +/* =================================================== */ +h1 { + font-size: 3em; + margin: 1.5em 0; + text-align: center; + line-height: 1.5em; + font-weight: bold; +} + +h2 { + font-size: 2em; + margin: 1.5em 0 0; + font-weight: bold; + line-height: 1.5em; +} + +h3 { + font-size: 1.5em; + margin: 1.5em 0 0; + font-weight: normal; + line-height: 1.5em; +} + +h4 { + font-size: 1em; + margin: 1.5em 0 0; + line-height: 1.5em; +} + +p { + margin: 0; +} + +/* ======================== */ +/* = Footnotes and themes = */ +/* ======================== */ +.theme-begin { + border-left: 0.1em solid #DDDDDD; + color: #666; + float: right; + margin: 0 -9.5em 0 0; + padding: 0 0.5em; + width: 7.5em; + font-style: normal; + font-weight: normal; + font-size: 16px; + display: none; +} + +.annotation { + font-style: normal; + font-weight: normal; + font-size: 16px; + display: none; +} + +#footnotes { + display: none; +} + +#footnotes .annotation { + display: block; + float: left; + width: 2.5em; + clear: both; +} + +#footnotes div { + margin: 1.5em 0 0 0; +} + +#footnotes p { + margin-left: 2.5em; +} + +/* =================== */ +/* = Custom elements = */ +/* =================== */ +span.author { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: 0.25em; +} + +span.collection { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-bottom: -0.25em; +} + +span.subtitle { + font-size: 0.75em; + display: block; + line-height: 1.5em; + margin-top: -0.25em; +} + +div.didaskalia { + font-style: italic; + margin: 0.5em 0 0; +} + +div.kwestia { + margin: 0.5em 0 0; +} + +div.stanza { + margin: 1.5em 0 0; +} + +div.kwestia div.stanza { + margin: 0; +} + +p.paragraph { + text-align: justify; + margin: 1.5em 0 0; +} + +p.motto { + text-align: justify; + font-style: italic; + margin: 1.5em 0 0; +} + +p.motto_podpis { + font-size: 0.875em; +} + +div.fragment { + border-bottom: 0.1em solid #999; + padding-bottom: 1.5em; +} + +div.note p, div.note p.paragraph { + text-align: right; + font-style: italic; +} + +hr.spacer { + height: 3em; + visibility: hidden; +} diff --git a/lib/librarian/book2html.xslt b/lib/librarian/book2html.xslt new file mode 100644 index 000000000..f52d83089 --- /dev/null +++ b/lib/librarian/book2html.xslt @@ -0,0 +1,200 @@ + + + + + + + + + + + + + + + + book2html output + + + + + + +
    +

    Przypisy

    + +
    + + [] + + +

    +
    + + + +
    +
    +
    +
    +
    + + +
    + + + + +

    + +

    +
    + +
    + + + + + + + + + + + + + + + + + + + + + +
    +
    + + +

    +
    + + +

    +
    + + +

    +
    + + +
    + +
    +
    + + +
    +
    + + +
    +

    +
      + +
    +
    +
    + + +
  • +
    + + + + + + + + + + + + + + + + + + + +

    + + + indent: 1em + + + indent: 2em + + + +

    +
    + + + + [] + + + +
    + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + +

    +
    + + + + +
    +
    + + +

    +
    + + +

    +
    + + +
    +
    + +
    diff --git a/lib/librarian/html.py b/lib/librarian/html.py new file mode 100644 index 000000000..ae5efa644 --- /dev/null +++ b/lib/librarian/html.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- +import os +import cStringIO +import re +import copy +import pkgutil + +from lxml import etree + + +ENTITY_SUBSTITUTIONS = [ + (u'---', u'—'), + (u'--', u'–'), + (u'...', u'…'), + (u',,', u'„'), + (u'"', u'”'), +] + + +def substitute_entities(context, text): + """XPath extension function converting all entites in passed text.""" + if isinstance(text, list): + text = ''.join(text) + for entity, substitutution in ENTITY_SUBSTITUTIONS: + text = text.replace(entity, substitutution) + return text + + +# Register substitute_entities function with lxml +ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') +ns['substitute_entities'] = substitute_entities + + +def transform(input_filename, output_filename): + """Transforms file input_filename in XML to output_filename in XHTML.""" + # Parse XSLT + style_filename = os.path.join(os.path.dirname(__file__), 'book2html.xslt') + style = etree.parse(style_filename) + + doc_file = cStringIO.StringIO() + expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE); + + f = open(input_filename, 'r') + for line in f: + line = line.decode('utf-8') + line = expr.sub(u'
    \n', line) + doc_file.write(line.encode('utf-8')) + f.close() + + doc_file.seek(0); + + parser = etree.XMLParser(remove_blank_text=True) + doc = etree.parse(doc_file, parser) + + result = doc.xslt(style) + result.write(output_filename, xml_declaration=True, pretty_print=True, encoding='utf-8') + + +class Fragment(object): + def __init__(self, id, themes): + super(Fragment, self).__init__() + self.id = id + self.themes = themes + self.events = [] + + def append(self, event, element): + self.events.append((event, element)) + + def closed_events(self): + stack = [] + for event, element in self.events: + if event == 'start': + stack.append(('end', element)) + elif event == 'end': + try: + stack.pop() + except IndexError: + print 'CLOSED NON-OPEN TAG:', element + + stack.reverse() + return self.events + stack + + def to_string(self): + result = [] + for event, element in self.closed_events(): + if event == 'start': + result.append(u'<%s %s>' % (element.tag, ' '.join('%s="%s"' % (k, v) for k, v in element.attrib.items()))) + if element.text: + result.append(element.text) + elif event == 'end': + result.append(u'' % element.tag) + if element.tail: + result.append(element.tail) + else: + result.append(element) + + return ''.join(result) + + def __unicode__(self): + return self.to_string() + + +def extract_fragments(input_filename): + """Extracts theme fragments from input_filename.""" + open_fragments = {} + closed_fragments = {} + + for event, element in etree.iterparse(input_filename, events=('start', 'end')): + # Process begin and end elements + if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'): + if not event == 'end': continue # Process elements only once, on end event + + # Open new fragment + if element.get('class', '') == 'theme-begin': + fragment = Fragment(id=element.get('fid'), themes=element.text) + + # Append parents + if element.getparent().tag != 'body': + parents = [element.getparent()] + while parents[-1].getparent().tag != 'body': + parents.append(parents[-1].getparent()) + + parents.reverse() + for parent in parents: + fragment.append('start', parent) + + open_fragments[fragment.id] = fragment + + # Close existing fragment + else: + try: + fragment = open_fragments[element.get('fid')] + except KeyError: + print '%s:closed not open fragment #%s' % (input_filename, element.get('fid')) + else: + closed_fragments[fragment.id] = fragment + del open_fragments[fragment.id] + + # Append element tail to lost_text (we don't want to lose any text) + if element.tail: + for fragment_id in open_fragments: + open_fragments[fragment_id].append('text', element.tail) + + + # Process all elements except begin and end + else: + # Omit annotation tags + if len(element.get('name', '')) or element.get('class', '') == 'annotation': + if event == 'end' and element.tail: + for fragment_id in open_fragments: + open_fragments[fragment_id].append('text', element.tail) + else: + for fragment_id in open_fragments: + open_fragments[fragment_id].append(event, copy.copy(element)) + + return closed_fragments, open_fragments +