From a7133c06fd9738c11a2bf60b4fc09365d15da1d7 Mon Sep 17 00:00:00 2001 From: Marcin Koziej Date: Wed, 6 Feb 2013 16:59:52 +0100 Subject: [PATCH 1/1] first take at pdf generation. --- librarian/functions.py | 18 +-- librarian/pyhtml.py | 2 +- librarian/pypdf.py | 353 +++++++++++++++++++++++++++++++++++++++++ librarian/xmlutils.py | 4 +- 4 files changed, 365 insertions(+), 12 deletions(-) create mode 100644 librarian/pypdf.py diff --git a/librarian/functions.py b/librarian/functions.py index 40f06cd..bd05ff4 100644 --- a/librarian/functions.py +++ b/librarian/functions.py @@ -11,7 +11,7 @@ from librarian.dcparser import Person def _register_function(f): """ Register extension function with lxml """ ns = etree.FunctionNamespace('http://wolnelektury.pl/functions') - ns[f.__name__] = f + ns[f.__name__] = lambda context, *args: f(*args) ENTITY_SUBSTITUTIONS = [ @@ -22,7 +22,7 @@ ENTITY_SUBSTITUTIONS = [ (u'"', u'”'), ] -def substitute_entities(context, text): +def substitute_entities(text): """XPath extension function converting all entites in passed text.""" if isinstance(text, list): text = ''.join(text) @@ -35,7 +35,7 @@ def reg_substitute_entities(): _register_function(substitute_entities) -def strip(context, text): +def strip(text): """Remove unneeded whitespace from beginning and end""" if isinstance(text, list): text = ''.join(text) @@ -46,7 +46,7 @@ def reg_strip(): _register_function(strip) -def starts_white(context, text): +def starts_white(text): if isinstance(text, list): text = ''.join(text) if not text: @@ -59,7 +59,7 @@ def reg_starts_white(): def reg_ends_white(): - def ends_white(context, text): + def ends_white(text): if isinstance(text, list): text = ''.join(text) if not text: @@ -68,7 +68,7 @@ def reg_ends_white(): _register_function(ends_white) -def wrap_words(context, text, wrapping): +def wrap_words(text, wrapping): """XPath extension function automatically wrapping words in passed text""" if isinstance(text, list): text = ''.join(text) @@ -93,7 +93,7 @@ def reg_wrap_words(): _register_function(wrap_words) -def person_name(context, text): +def person_name(text): """ Converts "Name, Forename" to "Forename Name" """ if isinstance(text, list): text = ''.join(text) @@ -104,7 +104,7 @@ def reg_person_name(): _register_function(person_name) -def texcommand(context, text): +def texcommand(text): """Remove non-letters""" if isinstance(text, list): text = ''.join(text) @@ -116,7 +116,7 @@ def reg_texcommand(): def reg_get(format_): - def get(context, *args): + def get(*args): obj = format_ for arg in args: if hasattr(obj, arg): diff --git a/librarian/pyhtml.py b/librarian/pyhtml.py index e5ca40e..b3c3ce0 100644 --- a/librarian/pyhtml.py +++ b/librarian/pyhtml.py @@ -16,7 +16,7 @@ class EduModule(Xmill): def __init__(self, options=None): super(EduModule, self).__init__(options) self.activity_counter = 0 - self.register_text_filter(lambda t: functions.substitute_entities(None, t)) + self.register_text_filter(functions.substitute_entities) def handle_powiesc(self, element): return u""" diff --git a/librarian/pypdf.py b/librarian/pypdf.py new file mode 100644 index 0000000..cb082ca --- /dev/null +++ b/librarian/pypdf.py @@ -0,0 +1,353 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +"""PDF creation library. + +Creates one big XML from the book and its children, converts it to LaTeX +with TeXML, then runs it by XeLaTeX. + +""" +from __future__ import with_statement +import os +import os.path +import shutil +from StringIO import StringIO +from tempfile import mkdtemp, NamedTemporaryFile +import re +from copy import deepcopy +from subprocess import call, PIPE + +from Texml.processor import process +from lxml import etree +from lxml.etree import XMLSyntaxError, XSLTApplyError + +from xmlutils import Xmill, tag, tagged, ifoption +from librarian.dcparser import Person +from librarian.parser import WLDocument +from librarian import ParseError, DCNS, get_resource, IOFile, Format +from librarian import functions +from pdf import PDFFormat + + +def escape(really): + def deco(f): + def _wrap(*args, **kw): + value = f(*args, **kw) + + prefix = (u'' % (really and 1 or 0)) + postfix = u'' + if isinstance(value, list): + import pdb; pdb.set_trace() + if isinstance(value, tuple): + return prefix + value[0], value[1] + postfix + else: + return prefix + value + postfix + return _wrap + return deco + + +def cmd(name, pass_text=False): + def wrap(self, element): + pre = u'' % name + + if pass_text: + pre += "%s" % element.text + return pre + '' + else: + return pre, '' + return wrap + + +def mark_alien_characters(text): + text = re.sub(ur"([\u0400-\u04ff]+)", ur"\1", text) + return text + + +class EduModule(Xmill): + def __init__(self, options=None): + super(EduModule, self).__init__(options) + self.activity_counter = 0 + self.register_text_filter(functions.substitute_entities) + self.register_text_filter(mark_alien_characters) + + def get_dc(self, element, dc_field, single=False): + values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri})) + if single: + return values[0] + return values + + def handle_rdf__RDF(self, _): + "skip metadata in generation" + return + + @escape(True) + def get_rightsinfo(self, element): + rights_lic = self.get_dc(element, 'rights.license', True) + return u'' + \ + (rights_lic and u'%s' % rights_lic or '') +\ + u'%s' % self.get_dc(element, 'rights', True) +\ + u'' + + @escape(True) + def get_authors(self, element): + authors = self.get_dc(element, 'creator.expert') + \ + self.get_dc(element, 'creator.scenario') + \ + self.get_dc(element, 'creator.textbook') + return u', '.join(authors) + + @escape(1) + def get_title(self, element): + return self.get_dc(element, 'title', True) + + def handle_utwor(self, element): + lines = [ + u''' + + + \\documentclass[%s]{wl} + \\usepackage{style}''' % self.options['customization_str'], + self.options['has_cover'] and '\usepackage{makecover}', + (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or + (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or + (self.options['morefloats'] == 'none' and + u'''\\IfFileExists{morefloats.sty}{ + \\usepackage{morefloats} + }{}'''), + u'''\\def\\authors{%s}''' % self.get_authors(element), + u'''\\author{\\authors}''', + u'''\\title{%s}''' % self.get_title(element), + u'''\\def\\bookurl{%s}''' % self.get_dc(element, 'identifier.url', True), + u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element), + u''] + + return u"".join(filter(None, lines)), u'' + + + handle_naglowek_rozdzial = escape(True)(cmd("naglowekrozdzial", True)) + handle_naglowek_podrozdzial = escape(True)(cmd("naglowekpodrozdzial", True)) + + @escape(1) + def handle_powiesc(self, element): + return u""" + + + """, """""" + + handle_autor_utworu = cmd('autorutworu', True) + handle_nazwa_utworu = cmd('nazwautworu', True) + handle_dzielo_nadrzedne = cmd('dzielonadrzedne', True) + handle_podtytul = cmd('podtytul', True) + + handle_akap = handle_akap_dialog = handle_akap_cd = lambda s, e: ("\n", "\n") + handle_strofa = lambda s, e: ("\n","\n") + + def handle_aktywnosc(self, element): + self.activity_counter += 1 + self.options = { + 'activity': True, + 'activity_counter': self.activity_counter, + 'sub_gen': True, + } + submill = EduModule(self.options) + + opis = submill.generate(element.xpath('opis')[0]) + + n = element.xpath('wskazowki') + if n: wskazowki = submill.generate(n[0]) + + else: wskazowki = '' + n = element.xpath('pomoce') + + if n: pomoce = submill.generate(n[0]) + else: pomoce = '' + + forma = ''.join(element.xpath('forma/text()')) + + czas = ''.join(element.xpath('czas/text()')) + + counter = self.activity_counter + + return u""" +Czas: %(czas)s min +Forma: %(forma)s +%(pomoce)s + +%(counter)d. %(opis)s + +%(wskazowki)s +""" % locals() + + handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', '')) + handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', '')) + + @ifoption(sub_gen=True) + def handle_pomoce(self, _): + return "Pomoce: ", "" + + def handle_czas(self, *_): + return + + def handle_forma(self, *_): + return + +# def handle_cwiczenie(self, element): +# exercise_handlers = { +# 'wybor': Wybor, +# 'uporzadkuj': Uporzadkuj, +# 'luki': Luki, +# 'zastap': Zastap, +# 'przyporzadkuj': Przyporzadkuj, +# 'prawdafalsz': PrawdaFalsz +# } + +# typ = element.attrib['typ'] +# handler = exercise_handlers[typ](self.options) +# return handler.generate(element) + +# # Lists +# def handle_lista(self, element, attrs={}): +# ltype = element.attrib.get('typ', 'punkt') +# if ltype == 'slowniczek': +# surl = element.attrib.get('href', None) +# sxml = None +# if surl: +# sxml = etree.fromstring(self.options['provider'].by_uri(surl).get_string()) +# self.options = {'slowniczek': True, 'slowniczek_xml': sxml } +# return '
', '
' + +# listtag = {'num': 'ol', +# 'punkt': 'ul', +# 'alfa': 'ul', +# 'czytelnia': 'ul'}[ltype] + +# classes = attrs.get('class', '') +# if classes: del attrs['class'] + +# attrs_s = ' '.join(['%s="%s"' % kv for kv in attrs.items()]) +# if attrs_s: attrs_s = ' ' + attrs_s + +# return '<%s class="lista %s %s"%s>' % (listtag, ltype, classes, attrs_s), '' % listtag + +# def handle_punkt(self, element): +# if self.options['slowniczek']: +# return '
', '
' +# else: +# return '
  • ', '
  • ' + +# def handle_definiendum(self, element): +# nxt = element.getnext() +# definiens_s = '' + +# # let's pull definiens from another document +# if self.options['slowniczek_xml'] and (not nxt or nxt.tag != 'definiens'): +# sxml = self.options['slowniczek_xml'] +# assert element.text != '' +# defloc = sxml.xpath("//definiendum[text()='%s']" % element.text) +# if defloc: +# definiens = defloc[0].getnext() +# if definiens.tag == 'definiens': +# subgen = EduModule(self.options) +# definiens_s = subgen.generate(definiens) + +# return u"
    ", u"
    " + definiens_s + +# def handle_definiens(self, element): +# return u"
    ", u"
    " + + +# def handle_podpis(self, element): +# return u"""
    """, u"
    " + +# def handle_tabela(self, element): +# has_frames = int(element.attrib.get("ramki", "0")) +# if has_frames: frames_c = "framed" +# else: frames_c = "" +# return u"""""" % frames_c, u"
    " + +# def handle_wiersz(self, element): +# return u"", u"" + +# def handle_kol(self, element): +# return u"", u"" + +# def handle_rdf__RDF(self, _): +# # ustal w opcjach rzeczy :D +# return + +# def handle_link(self, element): +# if 'material' in element.attrib: +# formats = re.split(r"[, ]+", element.attrib['format']) +# fmt_links = [] +# for f in formats: +# fmt_links.append(u'%s' % (self.options['urlmapper'].url_for_material(element.attrib['material'], f), f.upper())) + +# return u"", u' (%s)' % u' '.join(fmt_links) + + +# class Exercise(EduModule): +# def __init__(self, *args, **kw): +# self.question_counter = 0 +# super(Exercise, self).__init__(*args, **kw) + +# def handle_rozw_kom(self, element): +# return u"""""" + +# def handle_cwiczenie(self, element): +# self.options = {'exercise': element.attrib['typ']} +# self.question_counter = 0 +# self.piece_counter = 0 + +# pre = u""" +#
    +#
    +# """ % element.attrib +# post = u""" +#
    +# +# +# +# +# +#
    +#
    +#
    +# """ +# # Add a single tag if it's not there +# if not element.xpath(".//pytanie"): +# qpre, qpost = self.handle_pytanie(element) +# pre = pre + qpre +# post = qpost + post +# return pre, post + +# def handle_pytanie(self, element): +# """This will handle element, when there is no +# """ +# add_class = "" +# self.question_counter += 1 +# self.piece_counter = 0 +# solution = element.attrib.get('rozw', None) +# if solution: solution_s = ' data-solution="%s"' % solution +# else: solution_s = '' + +# handles = element.attrib.get('uchwyty', None) +# if handles: +# add_class += ' handles handles-%s' % handles +# self.options = {'handles': handles} + +# minimum = element.attrib.get('min', None) +# if minimum: minimum_s = ' data-minimum="%d"' % int(minimum) +# else: minimum_s = '' + +# return '
    ' %\ +# (add_class, self.question_counter, solution_s + minimum_s), \ +# "
    " + +class EduModulePDFFormat(PDFFormat): + def get_texml(self): + edumod = EduModule() + texml = edumod.generate(self.wldoc.edoc.getroot()).encode('utf-8') + + open("/tmp/texml.xml", "w").write(texml) + return texml diff --git a/librarian/xmlutils.py b/librarian/xmlutils.py index d762320..819c9a4 100644 --- a/librarian/xmlutils.py +++ b/librarian/xmlutils.py @@ -53,8 +53,6 @@ class Xmill(object): tagname = None # from nose.tools import set_trace - if isinstance(element, etree._Comment): return None - if element.tag[0] == '{': for nshort, nhref in element.nsmap.items(): try: @@ -89,6 +87,8 @@ class Xmill(object): if element is None: return None # end of tree def _handle_element(self, element): + if isinstance(element, etree._Comment): return None + handler = self._handle_for_element(element) # How many scopes try: -- 2.20.1