librarian/text.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from librarian import dcparser, parser, functions
   7 from lxml import etree
   8 import cStringIO
   9 import codecs
  10 import os
  11 import re
  12
  13
# Import-time setup: call the registration hooks from librarian.functions.
# NOTE(review): presumably these register the XPath/XSLT extension functions
# (entity substitution, word wrapping, stripping) that the book2txt
# stylesheet relies on -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_wrap_words()
functions.reg_strip()
  17
# Layout of the generated text file: a header delimited by "-----" lines,
# followed by the body text. It is filled in with %-formatting and expects
# the keys: description, license_description, source, url and text.
TEMPLATE = u"""\
Kodowanie znaków w dokumencie: UTF-8.
-----
%(description)s

%(license_description)s.%(source)s

Wersja lektury w opracowaniu merytorycznym i krytycznym (przypisy i motywy) dostępna jest na stronie %(url)s.
-----



%(text)s
"""
  32
  33 def transform(input_filename, output_filename, is_file=True, parse_dublincore=True, **options):
  34     """Transforms file input_filename in XML to output_filename in TXT."""
  35     # Parse XSLT
  36     style_filename = os.path.join(os.path.dirname(__file__), 'xslt/book2txt.xslt')
  37     style = etree.parse(style_filename)
  38
  39     if is_file:
  40         document = parser.WLDocument.from_file(input_filename, True, parse_dublincore=parse_dublincore)
  41     else:
  42         document = parser.WLDocument.from_string(input_filename, True, parse_dublincore=parse_dublincore)
  43
  44     result = document.transform(style, **options)
  45
  46     output_file = codecs.open(output_filename, 'wb', encoding='utf-8')
  47
  48     if parse_dublincore:
  49         parsed_dc = dcparser.parse(input_filename)
  50         description = parsed_dc.description
  51         url = parsed_dc.url
  52         license_description = parsed_dc.license_description
  53         license = parsed_dc.license
  54         if license:
  55             license_description = u"Ten utwór jest udostepniony na licencji %s: \n%s" % (license_description, license)
  56         else:
  57             license_description = u"Ten utwór nie jest chroniony prawem autorskim i znajduje się w domenie publicznej, co oznacza, że możesz go swobodnie wykorzystywać, publikować i rozpowszechniać"
  58         source = parsed_dc.source_name
  59         if source:
  60             source = "\n\nNa podstawie: " + source
  61         else:
  62             source = ''
  63     else:
  64         description = 'Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl).'
  65         url = '*' * 10
  66         license = ""
  67         license_description = ""
  68         source = ""
  69     output_file.write(TEMPLATE % {
  70         'description': description,
  71         'url': url,
  72         'license_description': license_description,
  73         'text': unicode(result),
  74         'source': source,
  75     })
  76