1 # -*- coding: utf-8 -*-
9 from librarian import dcparser
12 ENTITY_SUBSTITUTIONS = [
def strip(context, text):
    """Collapse whitespace and trim both ends of the passed text.

    Every run of whitespace is replaced with a single space, then leading
    and trailing whitespace is removed.  ``context`` is accepted but not
    used.  If ``text`` is a list, its items are concatenated first.
    """
    joined = ''.join(text) if isinstance(text, list) else text
    collapsed = re.sub(r'\s+', ' ', joined)
    return collapsed.strip()
def substitute_entities(context, text):
    """XPath extension function converting all entities in passed text.

    Each ``(entity, replacement)`` pair from ``ENTITY_SUBSTITUTIONS`` is
    applied in order with ``replace``.  ``context`` is accepted but not
    used.  If ``text`` is a list, its items are concatenated first.
    """
    result = ''.join(text) if isinstance(text, list) else text
    for pair in ENTITY_SUBSTITUTIONS:
        entity, replacement = pair
        result = result.replace(entity, replacement)
    return result
def wrap_words(context, text):
    """XPath extension function automatically wrapping words in passed text.

    The text is split on single whitespace characters and the pieces are
    packed onto lines so that no line is longer than ``MAX_LINE_LENGTH``
    characters, unless a single word is itself longer than the limit, in
    which case that word gets a line of its own.  ``context`` is accepted
    but not used.  If ``text`` is a list, its items are concatenated first.

    Returns the lines joined with newline characters; words within a line
    are joined with single spaces.
    """
    if isinstance(text, list):
        text = ''.join(text)

    lines = [[]]
    line_length = 0  # exact length of the last line once joined with spaces
    for word in re.split(r'\s', text):
        current = lines[-1]
        # A separating space is only needed when the line already has a word;
        # counting it unconditionally made the first line wrap one character
        # earlier than all the following ones.
        if current:
            needed = line_length + 1 + len(word)
        else:
            needed = len(word)

        # Never break before the first word of a line: an over-long word
        # would otherwise leave an empty line behind it.
        if current and needed > MAX_LINE_LENGTH:
            lines.append([word])
            line_length = len(word)
        else:
            current.append(word)
            line_length = needed

    return '\n'.join(' '.join(line) for line in lines)
# Register the XPath extension functions with lxml under the
# http://wolnelektury.pl/functions namespace; each key is the name
# under which the function is registered.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
def transform(input_filename, output_filename):
    """Transforms file input_filename in XML to output_filename in TXT.

    Every ``/`` followed by a whitespace character in the input is replaced
    with ``<br/>`` and a newline before the document is parsed.  The parsed
    document is run through the ``book2txt.xslt`` stylesheet located next to
    this module.  The transformed text is then used as a ``%``-format
    template filled in with the ``url`` attribute of
    ``dcparser.parse(input_filename)``, and written to ``output_filename``
    encoded as UTF-8.
    """
    # Parse XSLT
    style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt')
    style = etree.parse(style_filename)

    doc_file = cStringIO.StringIO()
    expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE)

    # The context manager closes the input even if decoding fails part-way.
    with open(input_filename, 'r') as f:
        for line in f:
            line = line.decode('utf-8')
            line = expr.sub(u'<br/>\n', line)
            doc_file.write(line.encode('utf-8'))

    # Rewind so the parser reads the buffer from its beginning.
    doc_file.seek(0)

    parser = etree.XMLParser(remove_blank_text=True)
    doc = etree.parse(doc_file, parser)

    result = doc.xslt(style)
    # Build the complete text before touching the destination, so a failure
    # here does not leave a truncated output file behind.
    text = unicode(result) % dcparser.parse(input_filename).url

    # Closing the file explicitly guarantees the data is flushed to disk.
    with codecs.open(output_filename, 'wb', encoding='utf-8') as output_file:
        output_file.write(text)