librarian/text.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 #    This file is part of Librarian.
   4 #
   5 #    Copyright © 2008,2009,2010 Fundacja Nowoczesna Polska <fundacja@nowoczesnapolska.org.pl>
   6 #
   7 #    For full list of contributors see AUTHORS file.
   8 #
   9 #    This program is free software: you can redistribute it and/or modify
  10 #    it under the terms of the GNU Affero General Public License as published by
  11 #    the Free Software Foundation, either version 3 of the License, or
  12 #    (at your option) any later version.
  13 #
  14 #    This program is distributed in the hope that it will be useful,
  15 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 #    GNU Affero General Public License for more details.
  18 #
  19 #    You should have received a copy of the GNU Affero General Public License
  20 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 #
  22 from librarian import dcparser, parser
  23 from lxml import etree
  24 import cStringIO
  25 import codecs
  26 import os
  27 import re
  28
  29
# (ASCII sequence, typographic replacement) pairs applied, in list order, by
# substitute_entities() below.  Order matters: u'---' has to come before
# u'--', otherwise the first two hyphens of a triple hyphen would already
# have been replaced by the time the longer sequence is looked for.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),
    (u'--', u'–'),
    (u'...', u'…'),
    (u',,', u'„'),
    (u'"', u'”'),
]
  37
  38
# Skeleton of the generated text file: a fixed (Polish) header followed by
# the book text.  It is filled in by transform() with %-formatting and
# expects two keys: ``url`` and ``text``.
TEMPLATE = u"""\
Kodowanie znaków w dokumencie: UTF-8.
-----
Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.

Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %(url)s.
-----



%(text)s
"""
  53
  54
  55 def strip(context, text):
  56     """Remove unneeded whitespace from beginning and end"""
  57     if isinstance(text, list):
  58         text = ''.join(text)
  59     return re.sub(r'\s+', ' ', text).strip()
  60
  61
  62 def substitute_entities(context, text):
  63     """XPath extension function converting all entites in passed text."""
  64     if isinstance(text, list):
  65         text = ''.join(text)
  66     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  67         text = text.replace(entity, substitutution)
  68     return text
  69
  70
  71 def wrap_words(context, text, wrapping):
  72     """XPath extension function automatically wrapping words in passed text"""
  73     if isinstance(text, list):
  74         text = ''.join(text)
  75     if not wrapping:
  76         return text
  77
  78     words = re.split(r'\s', text)
  79
  80     line_length = 0
  81     lines = [[]]
  82     for word in words:
  83         line_length += len(word) + 1
  84         if line_length > wrapping:
  85             # Max line length was exceeded. We create new line
  86             lines.append([])
  87             line_length = len(word)
  88         lines[-1].append(word)
  89     return '\n'.join(' '.join(line) for line in lines)
  90
  91
# Register the three helpers defined above (strip, substitute_entities and
# wrap_words) with lxml as XPath extension functions in the
# 'http://wolnelektury.pl/functions' namespace.
# NOTE(review): presumably these are the functions called from
# xslt/book2txt.xslt -- confirm against the stylesheet.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['strip'] = strip
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
  97
  98
  99 def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options):
 100     """Transforms file input_filename in XML to output_filename in TXT."""
 101     # Parse XSLT
 102     style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
 103     style = etree.parse(style_filename)
 104
 105     if is_file:
 106         document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore)
 107     else:
 108         document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore)
 109
 110     result = document.transform(style, **options)
 111
 112     output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
 113
 114     if parse_dublincore:
 115         url = dcparser.parse(input_filename).url
 116     else:
 117         url = '*' * 10
 118     output_file.write(TEMPLATE % {
 119         'url': url,
 120         'text': unicode(result),
 121     })
 122