1 # -*- coding: utf-8 -*-
9 from librarian import dcparser
12 ENTITY_SUBSTITUTIONS = [
def substitute_entities(context, text):
    """XPath extension function converting all entities in the passed text.

    Each (entity, substitution) pair in ENTITY_SUBSTITUTIONS is applied in
    list order with ``replace``, so a pair listed earlier is substituted
    before any pair listed later.

    :param context: first argument passed by lxml to XPath extension
        functions; not used here.
    :param text: a string, or a list that is joined into one string first
        (presumably the node-set form of an XPath argument -- verify against
        the stylesheet that calls this function).
    :returns: the text with every listed entity replaced.
    """
    if isinstance(text, list):
        # NOTE(review): the body of this branch and the final ``return``
        # were not visible in the reviewed listing; reconstructed here --
        # confirm against the full file.
        text = ''.join(text)
    for entity, substitution in ENTITY_SUBSTITUTIONS:
        text = text.replace(entity, substitution)
    return text
def wrap_words(context, text, max_line_length=None):
    """XPath extension function automatically wrapping words in passed text.

    The text is split on every single whitespace character and the pieces
    are re-joined with single spaces, starting a new line whenever the
    running length counter exceeds the limit.  Because the split is per
    character, runs of whitespace yield empty "words" and therefore keep
    their width in the output.

    :param context: first argument passed by lxml to XPath extension
        functions; not used here.
    :param text: a string, or a list that is joined into one string first.
    :param max_line_length: wrapping limit; when omitted (None) the module
        constant MAX_LINE_LENGTH is used, which is the original behaviour.
    :returns: the wrapped text, lines separated by ``\\n``.
    """
    if max_line_length is None:
        max_line_length = MAX_LINE_LENGTH
    if isinstance(text, list):
        text = ''.join(text)
    words = re.split(r'\s', text)

    # NOTE(review): the initial values of ``line_length`` and ``lines``, the
    # loop header and the ``lines.append([])`` call were not visible in the
    # reviewed listing; reconstructed here -- confirm against the full file.
    line_length = 0
    lines = [[]]
    for word in words:
        # Count the word plus the space that will precede the next word.
        line_length += len(word) + 1
        if line_length > max_line_length:
            # Max line length was exceeded. We create a new line and
            # restart the counter from the word that opens it.
            lines.append([])
            line_length = len(word)
        lines[-1].append(word)
    return '\n'.join(' '.join(line) for line in lines)
# Register the substitute_entities and wrap_words XPath extension functions
# with lxml under the http://wolnelektury.pl/functions namespace.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
def transform(input_filename, output_filename):
    """Transform the XML file input_filename into the text file output_filename.

    Steps:
      1. Parse the ``book2txt.xslt`` stylesheet stored next to this module.
      2. Copy the input into an in-memory buffer, replacing every "/" that
         is followed by a whitespace character with ``<br/>`` and a newline.
      3. Parse the buffer with lxml (``remove_blank_text=True``) and apply
         the stylesheet.
      4. Interpolate ``dcparser.parse(input_filename).url`` into the
         stylesheet output with the ``%`` operator and write the result to
         output_filename encoded as UTF-8.

    NOTE(review): the ``for line in f`` loop header, the closing of the
    input file and ``doc_file.seek(0)`` were not visible in the reviewed
    listing and are reconstructed here -- confirm against the full file.

    :param input_filename: path of the source XML file; its lines are
        decoded as UTF-8.
    :param output_filename: path of the text file to write.
    """
    # Parse the XSLT stylesheet that lives alongside this module.
    style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt')
    style = etree.parse(style_filename)

    doc_file = cStringIO.StringIO()
    expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE)

    # The ``with`` block closes the input even if decoding fails part-way.
    with open(input_filename, 'r') as f:
        for line in f:
            line = line.decode('utf-8')
            line = expr.sub(u'<br/>\n', line)
            doc_file.write(line.encode('utf-8'))

    # Rewind so the parser reads the buffer from its beginning.
    doc_file.seek(0)

    parser = etree.XMLParser(remove_blank_text=True)
    doc = etree.parse(doc_file, parser)

    result = doc.xslt(style)
    # Build the final text before opening the target, so a failure while
    # reading the metadata does not leave a truncated output file behind.
    text = unicode(result) % dcparser.parse(input_filename).url

    # Closing the file guarantees the buffered output is flushed to disk.
    with codecs.open(output_filename, 'wb', encoding='utf-8') as output_file:
        output_file.write(text)