librarian/text.py

   1 # -*- coding: utf-8 -*-
   2 from librarian import dcparser, parser
   3 from lxml import etree
   4 import cStringIO
   5 import codecs
   6 import os
   7 import re
   8
   9
# (ASCII shorthand, typographic replacement) pairs, applied in list order by
# substitute_entities(). Order matters: u'---' has to be replaced before
# u'--', otherwise the shorter shorthand would consume part of the longer one
# and the em dash would never be produced.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
]
  17
  18
# Layout of the generated text file: a fixed Polish-language notice followed
# by the body of the book. Filled in with %-formatting using a mapping that
# provides the keys ``url`` and ``text`` (see transform()).
TEMPLATE = u"""\
Kodowanie znaków w dokumencie: UTF-8.
-----
Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.

Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %(url)s.
-----



%(text)s
"""
  33
  34
  35 def strip(context, text):
  36     """Remove unneeded whitespace from beginning and end"""
  37     if isinstance(text, list):
  38         text = ''.join(text)
  39     return re.sub(r'\s+', ' ', text).strip()
  40
  41
  42 def substitute_entities(context, text):
  43     """XPath extension function converting all entites in passed text."""
  44     if isinstance(text, list):
  45         text = ''.join(text)
  46     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  47         text = text.replace(entity, substitutution)
  48     return text
  49
  50
  51 def wrap_words(context, text, wrapping):
  52     """XPath extension function automatically wrapping words in passed text"""
  53     if isinstance(text, list):
  54         text = ''.join(text)
  55     if not wrapping:
  56         return text
  57
  58     words = re.split(r'\s', text)
  59
  60     line_length = 0
  61     lines = [[]]
  62     for word in words:
  63         line_length += len(word) + 1
  64         if line_length > wrapping:
  65             # Max line length was exceeded. We create new line
  66             lines.append([])
  67             line_length = len(word)
  68         lines[-1].append(word)
  69     return '\n'.join(' '.join(line) for line in lines)
  70
  71
# Register strip, substitute_entities and wrap_words with lxml as extension
# functions in the http://wolnelektury.pl/functions namespace.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['strip'] = strip
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
  77
  78
  79 def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options):
  80     """Transforms file input_filename in XML to output_filename in TXT."""
  81     # Parse XSLT
  82     style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
  83     style = etree.parse(style_filename)
  84
  85     if is_file:
  86         document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore)
  87     else:
  88         document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore)
  89
  90     result = document.transform(style, **options)
  91
  92     output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
  93
  94     if parse_dublincore:
  95         url = dcparser.parse(input_filename).url
  96     else:
  97         url = '*' * 10
  98     output_file.write(TEMPLATE % {
  99         'url': url,
 100         'text': unicode(result),
 101     })
 102