librarian/text.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from librarian import dcparser, parser
   7 from lxml import etree
   8 import cStringIO
   9 import codecs
  10 import os
  11 import re
  12
  13
# Pairs of (plain-text sequence, typographic replacement) applied one after
# another, in this order, by substitute_entities().
# Order matters: '---' has to be replaced before '--', otherwise every
# triple hyphen would already have been consumed by the double-hyphen rule.
ENTITY_SUBSTITUTIONS = [
    (u'---', u'—'),  # em dash
    (u'--', u'–'),  # en dash
    (u'...', u'…'),  # horizontal ellipsis
    (u',,', u'„'),  # low opening double quotation mark
    (u'"', u'”'),  # right (closing) double quotation mark
]
  21
  22
# Skeleton of the generated text file: a fixed Polish-language header
# (encoding notice and licensing/provenance note) followed by the book body.
# It is filled in with %-formatting using two named keys:
#   url  -- inserted into the header sentence pointing at the online edition,
#   text -- the converted document text, placed after the header.
TEMPLATE = u"""\
Kodowanie znaków w dokumencie: UTF-8.
-----
Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl/). Reprodukcja cyfrowa wykonana przez
Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. Ten utwór nie jest chroniony prawem autorskim i znajduje
się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać.

Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %(url)s.
-----



%(text)s
"""
  37
  38
  39 def strip(context, text):
  40     """Remove unneeded whitespace from beginning and end"""
  41     if isinstance(text, list):
  42         text = ''.join(text)
  43     return re.sub(r'\s+', ' ', text).strip()
  44
  45
  46 def substitute_entities(context, text):
  47     """XPath extension function converting all entites in passed text."""
  48     if isinstance(text, list):
  49         text = ''.join(text)
  50     for entity, substitutution in ENTITY_SUBSTITUTIONS:
  51         text = text.replace(entity, substitutution)
  52     return text
  53
  54
  55 def wrap_words(context, text, wrapping):
  56     """XPath extension function automatically wrapping words in passed text"""
  57     if isinstance(text, list):
  58         text = ''.join(text)
  59     if not wrapping:
  60         return text
  61
  62     words = re.split(r'\s', text)
  63
  64     line_length = 0
  65     lines = [[]]
  66     for word in words:
  67         line_length += len(word) + 1
  68         if line_length > wrapping:
  69             # Max line length was exceeded. We create new line
  70             lines.append([])
  71             line_length = len(word)
  72         lines[-1].append(word)
  73     return '\n'.join(' '.join(line) for line in lines)
  74
  75
# Register the XPath extension functions defined in this module (strip,
# substitute_entities and wrap_words) with lxml under the
# 'http://wolnelektury.pl/functions' namespace. Each key is the name the
# function is exposed under in XPath expressions.
# NOTE(review): presumably consumed by xslt/book2txt.xslt -- confirm there.
ns = etree.FunctionNamespace('http://wolnelektury.pl/functions')
ns['strip'] = strip
ns['substitute_entities'] = substitute_entities
ns['wrap_words'] = wrap_words
  81
  82
  83 def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options):
  84     """Transforms file input_filename in XML to output_filename in TXT."""
  85     # Parse XSLT
  86     style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
  87     style = etree.parse(style_filename)
  88
  89     if is_file:
  90         document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore)
  91     else:
  92         document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore)
  93
  94     result = document.transform(style, **options)
  95
  96     output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
  97
  98     if parse_dublincore:
  99         url = dcparser.parse(input_filename).url
 100     else:
 101         url = '*' * 10
 102     output_file.write(TEMPLATE % {
 103         'url': url,
 104         'text': unicode(result),
 105     })
 106