librarian/text.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska
   4 #
   5 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   6 # For full license text see COPYING or <http://www.gnu.org/licenses/agpl.html>
   7 #
   8 from librarian import dcparser, parser
   9 from lxml import etree
  10 import cStringIO
  11 import codecs
  12 import os
  13 import re
  14
  15
# ASCII typing shortcuts paired with the typographic characters that
# substitute_entities() puts in their place.  The pairs are applied one after
# another in list order, so a longer shortcut has to come before any shortcut
# that is its prefix: u'---' must be replaced before u'--', otherwise the
# em dash could never be produced.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
]
  23
  24
# Skeleton of the generated text file, filled in by transform() with
# %-formatting: a fixed header (encoding note and project/licence notice)
# that embeds the 'url' key, followed by the rendered book under the 'text'
# key.
TEMPLATE = u"""\
Kodowanie znaków w dokumencie: UTF-8.
-----
Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.

Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %(url)s.
-----



%(text)s
"""
  39
  40
  41 def strip(context, text):
  42     """Remove unneeded whitespace from beginning and end"""
  43     if isinstance(text, list):
  44         text = ''.join(text)
  45     return re.sub(r'\s+', ' ', text).strip()
  46
  47
  48 def substitute_entities(context, text):
  49     """XPath extension function converting all entites in passed text."""
  50     if isinstance(text, list):
  51         text = ''.join(text)
  52     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  53         text = text.replace(entity, substitutution)
  54     return text
  55
  56
  57 def wrap_words(context, text, wrapping):
  58     """XPath extension function automatically wrapping words in passed text"""
  59     if isinstance(text, list):
  60         text = ''.join(text)
  61     if not wrapping:
  62         return text
  63
  64     words = re.split(r'\s', text)
  65
  66     line_length = 0
  67     lines = [[]]
  68     for word in words:
  69         line_length += len(word) + 1
  70         if line_length > wrapping:
  71             # Max line length was exceeded. We create new line
  72             lines.append([])
  73             line_length = len(word)
  74         lines[-1].append(word)
  75     return '\n'.join(' '.join(line) for line in lines)
  76
  77
# Register the XPath extension functions defined above (strip,
# substitute_entities and wrap_words) with lxml, in the
# http://wolnelektury.pl/functions namespace, each under its own name.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['strip'] = strip
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
  83
  84
  85 def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options):
  86     """Transforms file input_filename in XML to output_filename in TXT."""
  87     # Parse XSLT
  88     style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
  89     style = etree.parse(style_filename)
  90
  91     if is_file:
  92         document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore)
  93     else:
  94         document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore)
  95
  96     result = document.transform(style, **options)
  97
  98     output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
  99
 100     if parse_dublincore:
 101         url = dcparser.parse(input_filename).url
 102     else:
 103         url = '*' * 10
 104     output_file.write(TEMPLATE % {
 105         'url': url,
 106         'text': unicode(result),
 107     })
 108