librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from xmlutils import Xmill, tag, tagged, ifoption
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  30 from librarian import functions
  31 from pdf import PDFFormat
  32
  33
  34
  35 def escape(really):
  36     def deco(f):
  37         def _wrap(*args, **kw):
  38             value = f(*args, **kw)
  39
  40             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  41             postfix = u'</TeXML>'
  42             if isinstance(value, list):
  43                 import pdb; pdb.set_trace()
  44             if isinstance(value, tuple):
  45                 return prefix + value[0], value[1] + postfix
  46             else:
  47                 return prefix + value + postfix
  48         return _wrap
  49     return deco
  50
  51
  52 def cmd(name, pass_text=False):
  53     def wrap(self, element):
  54         pre = u'<cmd name="%s">' % name
  55
  56         if pass_text:
  57             pre += "<parm>%s</parm>" % element.text
  58             return pre + '</cmd>'
  59         else:
  60             return pre, '</cmd>'
  61     return wrap
  62
  63
  64 def mark_alien_characters(text):
  65     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  66     return text
  67
  68
  69 class EduModule(Xmill):
  70     def __init__(self, options=None):
  71         super(EduModule, self).__init__(options)
  72         self.activity_counter = 0
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\"/>')
  78             return txt
  79         self.register_text_filter(functions.substitute_entities)
  80         self.register_text_filter(mark_alien_characters)
  81         self.register_text_filter(swap_endlines)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         "skip metadata in generation"
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + \
  97           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  98           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  99           u'</cmd>'
 100
 101     @escape(True)
 102     def get_authors(self, element):
 103         authors = self.get_dc(element, 'creator.expert') + \
 104           self.get_dc(element, 'creator.scenario') + \
 105           self.get_dc(element, 'creator.textbook')
 106         return u', '.join(authors)
 107
 108     @escape(1)
 109     def get_title(self, element):
 110         return self.get_dc(element, 'title', True)
 111
 112     def handle_utwor(self, element):
 113         lines = [
 114             u'''
 115     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 116         <TeXML escape="0">
 117         \\documentclass[%s]{wl}
 118         \\usepackage{style}''' % self.options['customization_str'],
 119     self.options['has_cover'] and '\usepackage{makecover}',
 120     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 121     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 122     (self.options['morefloats'] == 'none' and
 123      u'''\\IfFileExists{morefloats.sty}{
 124             \\usepackage{morefloats}
 125         }{}'''),
 126     u'''\\def\\authors{%s}''' % self.get_authors(element),
 127     u'''\\author{\\authors}''',
 128     u'''\\title{%s}''' % self.get_title(element),
 129     u'''\\def\\bookurl{%s}''' % self.get_dc(element, 'identifier.url', True),
 130     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 131     u'</TeXML>']
 132
 133         return u"".join(filter(None, lines)), u'</TeXML>'
 134
 135
 136     @escape(1)
 137     def handle_powiesc(self, element):
 138         return u"""
 139     <env name="document">
 140     <cmd name="maketitle"/>
 141     """, """</env>"""
 142
 143     @escape(1)
 144     def handle_texcommand(self, element):
 145         cmd = functions.texcommand(element.tag)
 146         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 147
 148     handle_akap = \
 149     handle_akap = \
 150     handle_akap_cd = \
 151     handle_akap_cd = \
 152     handle_akap_dialog = \
 153     handle_akap_dialog = \
 154     handle_autor_utworu = \
 155     handle_dedykacja = \
 156     handle_didaskalia = \
 157     handle_didask_tekst = \
 158     handle_dlugi_cytat = \
 159     handle_dzielo_nadrzedne = \
 160     handle_lista_osoba = \
 161     handle_mat = \
 162     handle_miejsce_czas = \
 163     handle_motto = \
 164     handle_motto_podpis = \
 165     handle_naglowek_akt = \
 166     handle_naglowek_czesc = \
 167     handle_naglowek_listy = \
 168     handle_naglowek_osoba = \
 169     handle_naglowek_podrozdzial = \
 170     handle_naglowek_podrozdzial = \
 171     handle_naglowek_rozdzial = \
 172     handle_naglowek_rozdzial = \
 173     handle_naglowek_scena = \
 174     handle_nazwa_utworu = \
 175     handle_nota = \
 176     handle_osoba = \
 177     handle_pa = \
 178     handle_pe = \
 179     handle_podtytul = \
 180     handle_poezja_cyt = \
 181     handle_pr = \
 182     handle_pt = \
 183     handle_sekcja_asterysk = \
 184     handle_sekcja_swiatlo = \
 185     handle_separator_linia = \
 186     handle_slowo_obce = \
 187     handle_srodtytul = \
 188     handle_tytul_dziela = \
 189     handle_wyroznienie = \
 190     handle_texcommand
 191
 192     _handle_strofa = cmd("strofa", True)
 193
 194     def handle_strofa(self, element):
 195         self.options = {'strofa': True}
 196         return self._handle_strofa(element)
 197
 198     def handle_aktywnosc(self, element):
 199         self.activity_counter += 1
 200         self.options = {
 201             'activity': True,
 202             'activity_counter': self.activity_counter,
 203             'sub_gen': True,
 204         }
 205         submill = EduModule(self.options)
 206
 207         opis = submill.generate(element.xpath('opis')[0])
 208
 209         n = element.xpath('wskazowki')
 210         if n: wskazowki = submill.generate(n[0])
 211
 212         else: wskazowki = ''
 213         n = element.xpath('pomoce')
 214
 215         if n: pomoce = submill.generate(n[0])
 216         else: pomoce = ''
 217
 218         forma = ''.join(element.xpath('forma/text()'))
 219
 220         czas = ''.join(element.xpath('czas/text()'))
 221
 222         counter = self.activity_counter
 223
 224         return u"""
 225
 226 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 227 <cmd name="activityinfo"><parm>
 228  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 229  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 230  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 231 </parm></cmd>
 232
 233
 234 %(opis)s
 235
 236 %(wskazowki)s
 237 """ % locals()
 238
 239     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 240     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 241
 242     @ifoption(sub_gen=True)
 243     def handle_pomoce(self, _):
 244         return "Pomoce: ", ""
 245
 246     def handle_czas(self, *_):
 247         return
 248
 249     def handle_forma(self, *_):
 250         return
 251
 252     def handle_lista(self, element, attrs={}):
 253         ltype = element.attrib.get('typ', 'punkt')
 254         if ltype == 'slowniczek':
 255             surl = element.attrib.get('href', None)
 256             sxml = None
 257             if surl:
 258                 sxml = etree.fromstring(self.options['provider'].by_uri(surl).get_string())
 259             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 260
 261         listcmd = {'num': 'enumerate',
 262                'punkt': 'itemize',
 263                'alfa': 'itemize',
 264                'slowniczek': 'itemize',
 265                'czytelnia': 'itemize'}[ltype]
 266
 267         return u'<env name="%s">' % listcmd, u'</env>'
 268
 269     def handle_punkt(self, element):
 270         return '<cmd name="item"/>', ''
 271
 272     def handle_cwiczenie(self, element):
 273         exercise_handlers = {
 274             'wybor': Wybor}
 275             # 'uporzadkuj': Uporzadkuj,
 276             # 'luki': Luki,
 277             # 'zastap': Zastap,
 278             # 'przyporzadkuj': Przyporzadkuj,
 279             # 'prawdafalsz': PrawdaFalsz
 280
 281         typ = element.attrib['typ']
 282         if not typ in exercise_handlers:
 283             return '(no handler)'
 284         handler = exercise_handlers[typ](self.options)
 285         return handler.generate(element)
 286
 287     # XXX this is copied from pyhtml.py, except for return and
 288     # should be refactored for no code duplication
 289     def handle_definiendum(self, element):
 290         nxt = element.getnext()
 291         definiens_s = ''
 292
 293         # let's pull definiens from another document
 294         if self.options['slowniczek_xml'] and (not nxt or nxt.tag != 'definiens'):
 295             sxml = self.options['slowniczek_xml']
 296             assert element.text != ''
 297             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 298             if defloc:
 299                 definiens = defloc[0].getnext()
 300                 if definiens.tag == 'definiens':
 301                     subgen = EduModule(self.options)
 302                     definiens_s = subgen.generate(definiens)
 303
 304         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 305
 306     def handle_definiens(self, element):
 307         return u"", u""
 308
 309     def handle_podpis(self, element):
 310         return u"""<env name="figure">""", u"</env>"
 311
 312     def handle_tabela(self, element):
 313         max_col = 0
 314         for w in element.xpath("wiersz"):
 315             ks = w.xpath("kol")
 316             if max_col < len(ks):
 317                 max_col = len(ks)
 318         self.options = {'columnts': max_col}
 319         # styling:
 320                 #        has_frames = int(element.attrib.get("ramki", "0"))
 321                 #        if has_frames: frames_c = "framed"
 322                 #        else: frames_c = ""
 323                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 324         return u'''
 325 <cmd name="begin"><parm>tabular</parm><opt>%s</opt></cmd>
 326     ''' % ('l' * max_col), \
 327     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 328
 329     @escape(1)
 330     def handle_wiersz(self, element):
 331         return u"", u'<ctrl ch="\\"/>'
 332
 333     @escape(1)
 334     def handle_kol(self, element):
 335         if element.getnext() is not None:
 336             return u"", u'<spec cat="align">'
 337         return u"", u""
 338
 339     handle_link = cmd('em', True)
 340
 341
 342 class Exercise(EduModule):
 343     def __init__(self, *args, **kw):
 344         self.question_counter = 0
 345         super(Exercise, self).__init__(*args, **kw)
 346
 347     handle_rozw_kom = ifoption(teacher=True)(cmd('akap', True))
 348
 349     def handle_cwiczenie(self, element):
 350         self.options = {'exercise': element.attrib['typ']}
 351         self.question_counter = 0
 352         self.piece_counter = 0
 353
 354         pre = u""
 355         post = u""
 356         # Add a single <pytanie> tag if it's not there
 357         if not element.xpath(".//pytanie"):
 358             qpre, qpost = self.handle_pytanie(element)
 359             pre = pre + qpre
 360             post = qpost + post
 361         return pre, post
 362
 363     def handle_pytanie(self, element):
 364         """This will handle <cwiczenie> element, when there is no <pytanie>
 365         """
 366         opts = {}
 367         self.question_counter += 1
 368         self.piece_counter = 0
 369         solution = element.attrib.get('rozw', None)
 370         if solution:
 371             opts['solution'] = solution
 372
 373         handles = element.attrib.get('uchwyty', None)
 374         if handles:
 375             opts['handles'] = handles
 376
 377         minimum = element.attrib.get('min', None)
 378         if minimum:
 379             opts['minimum'] = minimum
 380
 381         if opts:
 382             self.options = opts
 383         return u"", u""
 384
 385
 386 class Wybor(Exercise):
 387     INSTRUCTION = None
 388
 389     def handle_cwiczenie(self, element):
 390         pre, post = super(Wybor, self).handle_cwiczenie(element)
 391         is_single_choice = True
 392         pytania = element.xpath(".//pytanie")
 393         if not pytania:
 394             pytania = [element]
 395         for p in pytania:
 396             solutions = re.split(r"[, ]+", p.attrib['rozw'])
 397             if len(solutions) != 1:
 398                 is_single_choice = False
 399                 break
 400             choices = p.xpath(".//*[@nazwa]")
 401             uniq = set()
 402             for n in choices: uniq.add(n.attrib['nazwa'])
 403             if len(choices) != len(uniq):
 404                 is_single_choice = False
 405                 break
 406
 407         self.options = {'single': is_single_choice}
 408         return pre, post
 409
 410     def handle_punkt(self, element):
 411         if self.options['exercise'] and element.attrib.get('nazwa', None):
 412             cmd = 'radio' if self.options['single'] else 'checkbox'
 413             return u'<cmd name="%s"/>' % cmd, ''
 414         else:
 415             return super(Wybor, self).handle_punkt(element)
 416
 417
 418
 419
 420 def fix_lists(tree):
 421     lists = tree.xpath(".//lista")
 422     for l in lists:
 423         if l.text:
 424             p = l.getprevious()
 425             if p is not None:
 426                 if p.tail is None: p.tail = ''
 427                 p.tail += l.text
 428             else:
 429                 p = l.getparent()
 430                 if p.text is None: p.text = ''
 431                 p.text += l.text
 432             l.text = ''
 433     return tree
 434
 435
 436 class EduModulePDFFormat(PDFFormat):
 437     def get_texml(self):
 438         edumod = EduModule()
 439         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 440
 441         open("/tmp/texml.xml", "w").write(texml)
 442         return texml