#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Provenance: lib/librarian/bin/book2txt.py, added in commit
# 2903250a90904488ff3de04ecbbf2b6b5c421839 by Marek Stępniowski on
# Mon, 15 Sep 2008 ("Added book2txt.py to repository.").
"""Convert book XML sources to plain text.

Each SOURCE file given on the command line is read as UTF-8, passed through
the ordered substitutions in ``REGEXES`` and written, again as UTF-8, next to
the input with its extension replaced by ``.txt``.
"""
import re
import os
import sys
import optparse
import codecs


# Ordered (pattern, replacement) pairs; every pair is applied to the whole
# document, in this order, by ``xml_to_text``.
#
# NOTE(review): in the copy of this script that was reviewed, every
# ``<tag ...>`` fragment inside the patterns marked "reconstructed" had been
# stripped by an HTML-unaware extraction step (e.g. the first pattern survived
# only as ``]*>(.|\n)*?``).  The tag names below are therefore assumptions --
# confirm each of them against the upstream file before relying on the output.
REGEXES = [
    # reconstructed: presumably drops the whole metadata element
    (r'<rdf:RDF[^>]*>(.|\n)*?</rdf:RDF>', ''),
    # reconstructed: presumably drops theme markers together with their text
    (r'<motyw[^>]*>(.|\n)*?</motyw>', ''),
    # empty <begin id="b123" /> / <end id="e123" /> marker elements
    ('<(begin|end)\\sid=[\'|"][b|e]\\d+[\'|"]\\s/>', ''),
    # reconstructed: only the ``(()|())`` skeleton of this pattern survived
    (r'<extra>((<!--<(elementy_poczatkowe|tekst_glowny)>-->)'
     r'|(<!--</(elementy_poczatkowe|tekst_glowny)>-->))</extra>', ''),
    # reconstructed: only ``[^<]*`` survived
    (r'<uwaga>[^<]*</uwaga>', ''),
    # reconstructed: only ``(.|\n)*?`` survived; the character class is a guess
    (r'<p[aert]>(.|\n)*?</p[aert]>', ''),
    # any tag that is still left
    (r'<[^>]+>', ''),
    # A slash that ends a line.  ``$`` matches only at the very end of the
    # text unless MULTILINE is on, so the original pattern ``/$`` stripped the
    # slash from the last line only; ``(?m)`` makes it apply to every line.
    (r'(?m)/$', ''),
    # Typographic replacements.  ``---`` must be handled before ``--``.
    (r'---', u'—'),
    (r'--', u'-'),
    (r',,', u'„'),
    (r'"', u'”'),
]


def xml_to_text(xml):
    """Return *xml* after applying every substitution in ``REGEXES`` in order.

    :param xml: the document as a (unicode) string
    :returns: the converted text
    """
    for pattern, repl in REGEXES:
        xml = re.sub(pattern, repl, xml)
    return xml


def convert_file(input_filename):
    """Convert one UTF-8 file and write the result next to it.

    The output name is *input_filename* with its extension replaced by
    ``.txt``; an existing file of that name is overwritten.

    :param input_filename: path of the file to convert
    :returns: the path of the file that was written
    """
    output_filename = os.path.splitext(input_filename)[0] + '.txt'

    # ``with`` closes both files deterministically; the original left them
    # open, relying on garbage collection to flush the output.
    with codecs.open(input_filename, 'r', encoding='utf-8') as source:
        xml = source.read()

    text = xml_to_text(xml)

    with codecs.open(output_filename, 'w', encoding='utf-8') as output:
        output.write(text)

    return output_filename


def main(argv=None):
    """Command-line entry point.

    :param argv: argument list without the program name; ``None`` makes
        optparse read ``sys.argv[1:]``
    :raises SystemExit: with status 1 (after printing the help text) when no
        SOURCE file is given
    """
    # Parse commandline arguments
    usage = """Usage: %prog [options] SOURCE [SOURCE...]
    Convert SOURCE files to TXT format."""

    parser = optparse.OptionParser(usage=usage)

    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
        help='print status messages to stdout')

    options, input_filenames = parser.parse_args(argv)

    if not input_filenames:
        parser.print_help()
        # sys.exit rather than the bare ``exit`` helper, which is only
        # injected by the ``site`` module.
        sys.exit(1)

    # Do some real work
    for input_filename in input_filenames:
        if options.verbose:
            # Single-argument call form prints the same on Python 2 and 3.
            print(input_filename)
        convert_file(input_filename)


if __name__ == '__main__':
    main()