X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/239d66922f4b83ee5baaa284a9c33a32bfcb99a4..0a471a47650228366fa66c09ecdff056d10852ce:/bin/bookfragments.py diff --git a/bin/bookfragments.py b/bin/bookfragments.py index 73d271116..ea2b0b7b7 100755 --- a/bin/bookfragments.py +++ b/bin/bookfragments.py @@ -7,22 +7,6 @@ import copy from lxml import etree -# Parse args -usage = """Usage: %prog [options] SOURCE [SOURCE...] -Extract theme fragments from SOURCE.""" - -parser = optparse.OptionParser(usage=usage) - -parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='print status messages to stdout') - -options, input_filenames = parser.parse_args() - -if len(input_filenames) < 1: - parser.print_help() - exit(1) - - class Fragment(object): def __init__(self, id, themes): super(Fragment, self).__init__() @@ -67,19 +51,12 @@ class Fragment(object): return self.to_string() -# Do some real work -for input_filename in input_filenames: - if options.verbose: - print input_filename - - output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' - +def extract_fragments(input_filename): + """Extracts theme fragments from input_filename.""" open_fragments = {} closed_fragments = {} - lost_text = [] for event, element in etree.iterparse(input_filename, events=('start', 'end')): - # Process begin and end elements if element.tag == 'span' and element.get('class', '') in ('theme-begin', 'theme-end'): if not event == 'end': continue # Process elements only once, on end event @@ -126,23 +103,50 @@ for input_filename in input_filenames: else: for fragment_id in open_fragments: open_fragments[fragment_id].append(event, copy.copy(element)) + + return closed_fragments, open_fragments + +if __name__ == '__main__': + # Parse commandline arguments + usage = """Usage: %prog [options] SOURCE [SOURCE...] + Extract theme fragments from SOURCE.""" + + parser = optparse.OptionParser(usage=usage) + + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help='print status messages to stdout') + + options, input_filenames = parser.parse_args() + + if len(input_filenames) < 1: + parser.print_help() + exit(1) + + # Do some real work + for input_filename in input_filenames: + if options.verbose: + print input_filename + + output_filename = os.path.splitext(input_filename)[0] + '.fragments.html' + + closed_fragments, open_fragments = extract_fragments(input_filename) - for fragment_id in open_fragments: - print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) + for fragment_id in open_fragments: + print '%s:warning:unclosed fragment #%s' % (input_filename, fragment_id) - output_file = open(output_filename, 'w') - output_file.write(""" - - - bookfragments output - - - - """) - for fragment in closed_fragments.values(): - html = u'

[#%s] %s

%s
' % (fragment.id, fragment.themes, fragment) - output_file.write(html.encode('utf-8')) - output_file.write('') - output_file.close() + output_file = open(output_filename, 'w') + output_file.write(""" + + + bookfragments output + + + + """) + for fragment in closed_fragments.values(): + html = u'

[#%s] %s

%s
' % (fragment.id, fragment.themes, fragment) + output_file.write(html.encode('utf-8')) + output_file.write('') + output_file.close()