librarian/text.py

   1 # -*- coding: utf-8 -*-
   2 import os
   3 import cStringIO
   4 import re
   5 import codecs
   6
   7 from lxml import etree
   8
   9 from librarian import dcparser
  10
  11
# Pairs of (plain-text sequence, typographic replacement character).
# substitute_entities() applies the pairs in list order, so a longer sequence
# must come before any shorter sequence it contains (u'---' before u'--').
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
]
  19
  20
# Line-length limit (in characters) used by wrap_words() when breaking text.
MAX_LINE_LENGTH = 80
  22
  23
  24 def strip(context, text):
  25     """Remove unneeded whitespace from beginning and end"""
  26     if isinstance(text, list):
  27         text = ''.join(text)
  28     return re.sub(r'\s+', ' ', text).strip()
  29
  30
  31 def substitute_entities(context, text):
  32     """XPath extension function converting all entites in passed text."""
  33     if isinstance(text, list):
  34         text = ''.join(text)
  35     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  36         text = text.replace(entity, substitutution)
  37     return text
  38
  39
  40 def wrap_words(context, text):
  41     """XPath extension function automatically wrapping words in passed text"""
  42     if isinstance(text, list):
  43         text = ''.join(text)
  44     words = re.split(r'\s', text)
  45
  46     line_length = 0
  47     lines = [[]]
  48     for word in words:
  49         line_length += len(word) + 1
  50         if line_length > MAX_LINE_LENGTH:
  51             # Max line length was exceeded. We create new line
  52             lines.append([])
  53             line_length = len(word)
  54         lines[-1].append(word)
  55     return '\n'.join(' '.join(line) for line in lines)
  56
  57
# Register the helper functions defined above with lxml as XPath extension
# functions in the http://wolnelektury.pl/functions namespace.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['strip'] = strip
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
  63
  64
  65 def transform(input_filename, output_filename):
  66     """Transforms file input_filename in XML to output_filename in TXT."""
  67     # Parse XSLT
  68     style_filename = os.path.join(os.path.dirname(__file__), 'book2txt.xslt')
  69     style = etree.parse(style_filename)
  70
  71     doc_file = cStringIO.StringIO()
  72     expr = re.compile(r'/\s', re.MULTILINE | re.UNICODE);
  73
  74     f = open(input_filename, 'r')
  75     for line in f:
  76         line = line.decode('utf-8')
  77         line = expr.sub(u'<br/>\n', line)
  78         doc_file.write(line.encode('utf-8'))
  79     f.close()
  80
  81     doc_file.seek(0)
  82
  83     parser = etree.XMLParser(remove_blank_text=True)
  84     doc = etree.parse(doc_file, parser)
  85
  86     result = doc.xslt(style)
  87     output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
  88     output_file.write(unicode(result) % dcparser.parse(input_filename).url)
  89