librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 from copy import deepcopy
  14 import os
  15 import os.path
  16 import shutil
  17 from StringIO import StringIO
  18 from tempfile import mkdtemp, NamedTemporaryFile
  19 import re
  20 import random
  21 from copy import deepcopy
  22 from subprocess import call, PIPE
  23 from urllib2 import urlopen
  24
  25 from Texml.processor import process
  26 from lxml import etree
  27 from lxml.etree import XMLSyntaxError, XSLTApplyError
  28
  29 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  30 from librarian.dcparser import Person
  31 from librarian.parser import WLDocument
  32 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  33 from librarian import functions
  34 from pdf import PDFFormat
  35
  36
  37
  38 def escape(really):
  39     def deco(f):
  40         def _wrap(*args, **kw):
  41             value = f(*args, **kw)
  42
  43             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  44             postfix = u'</TeXML>'
  45             if isinstance(value, list):
  46                 import pdb; pdb.set_trace()
  47             if isinstance(value, tuple):
  48                 return prefix + value[0], value[1] + postfix
  49             else:
  50                 return prefix + value + postfix
  51         return _wrap
  52     return deco
  53
  54
  55 def cmd(name, parms=None):
  56     def wrap(self, element=None):
  57         pre, post = tag_open_close('cmd', name=name)
  58
  59         if parms:
  60             for parm in parms:
  61                 e = etree.Element("parm")
  62                 e.text = parm
  63                 pre += etree.tostring(e)
  64         if element is not None:
  65             pre += "<parm>"
  66             post = "</parm>" + post
  67             return pre, post
  68         else:
  69             return pre + post
  70     return wrap
  71
  72
  73 def mark_alien_characters(text):
  74     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  75     return text
  76
  77
  78 class EduModule(Xmill):
  79     def __init__(self, options=None):
  80         super(EduModule, self).__init__(options)
  81         self.activity_counter = 0
  82         self.exercise_counter = 0
  83
  84         def swap_endlines(txt):
  85             if self.options['strofa']:
  86                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  87             return txt
  88         self.register_text_filter(functions.substitute_entities)
  89         self.register_text_filter(mark_alien_characters)
  90         self.register_text_filter(swap_endlines)
  91
  92     def get_dc(self, element, dc_field, single=False):
  93         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  94         if single:
  95             return values[0]
  96         return values
  97
  98     def handle_rdf__RDF(self, _):
  99         "skip metadata in generation"
 100         return
 101
 102     @escape(True)
 103     def get_rightsinfo(self, element):
 104         rights_lic = self.get_dc(element, 'rights.license', True)
 105         return u'<cmd name="rightsinfostr">' + \
 106           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
 107           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
 108           u'</cmd>'
 109
 110     @escape(True)
 111     def get_authors(self, element):
 112         authors = self.get_dc(element, 'creator.expert') + \
 113           self.get_dc(element, 'creator.scenario') + \
 114           self.get_dc(element, 'creator.textbook')
 115         return u', '.join(authors)
 116
 117     @escape(1)
 118     def get_title(self, element):
 119         return self.get_dc(element, 'title', True)
 120
 121     def handle_utwor(self, element):
 122         lines = [
 123             u'''
 124     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 125         <TeXML escape="0">
 126         \\documentclass[%s]{wl}
 127         \\usepackage{style}''' % self.options['customization_str'],
 128     self.options['has_cover'] and '\usepackage{makecover}',
 129     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 130     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 131     (self.options['morefloats'] == 'none' and
 132      u'''\\IfFileExists{morefloats.sty}{
 133             \\usepackage{morefloats}
 134         }{}'''),
 135     u'''\\def\\authors{%s}''' % self.get_authors(element),
 136     u'''\\author{\\authors}''',
 137     u'''\\title{%s}''' % self.get_title(element),
 138     u'''\\def\\bookurl{%s}''' % self.get_dc(element, 'identifier.url', True),
 139     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 140     u'</TeXML>']
 141
 142         return u"".join(filter(None, lines)), u'</TeXML>'
 143
 144
 145     @escape(1)
 146     def handle_powiesc(self, element):
 147         return u"""
 148     <env name="document">
 149     <cmd name="maketitle"/>
 150     """, """</env>"""
 151
 152     @escape(1)
 153     def handle_texcommand(self, element):
 154         cmd = functions.texcommand(element.tag)
 155         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 156
 157     handle_akap = \
 158     handle_akap = \
 159     handle_akap_cd = \
 160     handle_akap_cd = \
 161     handle_akap_dialog = \
 162     handle_akap_dialog = \
 163     handle_autor_utworu = \
 164     handle_dedykacja = \
 165     handle_didaskalia = \
 166     handle_didask_tekst = \
 167     handle_dlugi_cytat = \
 168     handle_dzielo_nadrzedne = \
 169     handle_lista_osoba = \
 170     handle_mat = \
 171     handle_miejsce_czas = \
 172     handle_motto = \
 173     handle_motto_podpis = \
 174     handle_naglowek_akt = \
 175     handle_naglowek_czesc = \
 176     handle_naglowek_listy = \
 177     handle_naglowek_osoba = \
 178     handle_naglowek_podrozdzial = \
 179     handle_naglowek_podrozdzial = \
 180     handle_naglowek_rozdzial = \
 181     handle_naglowek_rozdzial = \
 182     handle_naglowek_scena = \
 183     handle_nazwa_utworu = \
 184     handle_nota = \
 185     handle_osoba = \
 186     handle_pa = \
 187     handle_pe = \
 188     handle_podtytul = \
 189     handle_poezja_cyt = \
 190     handle_pr = \
 191     handle_pt = \
 192     handle_sekcja_asterysk = \
 193     handle_sekcja_swiatlo = \
 194     handle_separator_linia = \
 195     handle_slowo_obce = \
 196     handle_srodtytul = \
 197     handle_tytul_dziela = \
 198     handle_wyroznienie = \
 199     handle_texcommand
 200
 201     _handle_strofa = cmd("strofa")
 202
 203     def handle_strofa(self, element):
 204         self.options = {'strofa': True}
 205         return self._handle_strofa(element)
 206
 207     def handle_aktywnosc(self, element):
 208         self.activity_counter += 1
 209         self.options = {
 210             'activity': True,
 211             'activity_counter': self.activity_counter,
 212             'sub_gen': True,
 213         }
 214         submill = EduModule(self.options)
 215
 216         opis = submill.generate(element.xpath('opis')[0])
 217
 218         n = element.xpath('wskazowki')
 219         if n: wskazowki = submill.generate(n[0])
 220
 221         else: wskazowki = ''
 222         n = element.xpath('pomoce')
 223
 224         if n: pomoce = submill.generate(n[0])
 225         else: pomoce = ''
 226
 227         forma = ''.join(element.xpath('forma/text()'))
 228
 229         czas = ''.join(element.xpath('czas/text()'))
 230
 231         counter = self.activity_counter
 232
 233         return u"""
 234
 235 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 236 <cmd name="activityinfo"><parm>
 237  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 238  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 239  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 240 </parm></cmd>
 241
 242
 243 %(opis)s
 244
 245 %(wskazowki)s
 246 """ % locals()
 247
 248     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 249     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 250
 251     @ifoption(sub_gen=True)
 252     def handle_pomoce(self, _):
 253         return "Pomoce: ", ""
 254
 255     def handle_czas(self, *_):
 256         return
 257
 258     def handle_forma(self, *_):
 259         return
 260
 261     def handle_lista(self, element, attrs={}):
 262         if not element.findall("punkt"):
 263             return None
 264         ltype = element.attrib.get('typ', 'punkt')
 265         if ltype == 'slowniczek':
 266             surl = element.attrib.get('href', None)
 267             sxml = None
 268             if surl:
 269                 sxml = etree.fromstring(self.options['provider'].by_uri(surl).get_string())
 270             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 271
 272         listcmd = {'num': 'enumerate',
 273                'punkt': 'itemize',
 274                'alfa': 'itemize',
 275                'slowniczek': 'itemize',
 276                'czytelnia': 'itemize'}[ltype]
 277
 278         return u'<env name="%s">' % listcmd, u'</env>'
 279
 280     def handle_punkt(self, element):
 281         return '<cmd name="item"/>', ''
 282
 283     def handle_cwiczenie(self, element):
 284         exercise_handlers = {
 285             'wybor': Wybor,
 286             'uporzadkuj': Uporzadkuj,
 287             'luki': Luki,
 288             'zastap': Zastap,
 289             'przyporzadkuj': Przyporzadkuj,
 290             'prawdafalsz': PrawdaFalsz
 291         }
 292
 293         typ = element.attrib['typ']
 294         self.exercise_counter += 1
 295         if not typ in exercise_handlers:
 296             return '(no handler)'
 297         self.options = {'exercise_counter': self.exercise_counter}
 298         handler = exercise_handlers[typ](self.options)
 299         return handler.generate(element)
 300
    # XXX: this is copied from pyhtml.py with only the return value changed;
    # it should be refactored to remove the code duplication.
 303     def handle_definiendum(self, element):
 304         nxt = element.getnext()
 305         definiens_s = ''
 306
 307         # let's pull definiens from another document
 308         if self.options['slowniczek_xml'] and (not nxt or nxt.tag != 'definiens'):
 309             sxml = self.options['slowniczek_xml']
 310             assert element.text != ''
 311             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 312             if defloc:
 313                 definiens = defloc[0].getnext()
 314                 if definiens.tag == 'definiens':
 315                     subgen = EduModule(self.options)
 316                     definiens_s = subgen.generate(definiens)
 317
 318         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 319
 320     def handle_definiens(self, element):
 321         return u"", u""
 322
 323     def handle_podpis(self, element):
 324         return u"""<env name="figure">""", u"</env>"
 325
 326     def handle_tabela(self, element):
 327         max_col = 0
 328         for w in element.xpath("wiersz"):
 329             ks = w.xpath("kol")
 330             if max_col < len(ks):
 331                 max_col = len(ks)
 332         self.options = {'columnts': max_col}
 333         # styling:
 334                 #        has_frames = int(element.attrib.get("ramki", "0"))
 335                 #        if has_frames: frames_c = "framed"
 336                 #        else: frames_c = ""
 337                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 338         return u'''
 339 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 340     ''' % ('l' * max_col), \
 341     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 342
 343     @escape(1)
 344     def handle_wiersz(self, element):
 345         return u"", u'<ctrl ch="\\"/>'
 346
 347     @escape(1)
 348     def handle_kol(self, element):
 349         if element.getnext() is not None:
 350             return u"", u'<spec cat="align" />'
 351         return u"", u""
 352
 353     def handle_link(self, element):
 354         if element.attrib.get('url'):
 355             url = element.attrib.get('url')
 356             if url == element.text:
 357                 return cmd('url')(self, element)
 358             else:
 359                 return cmd('href', parms=[element.attrib['url']])(self, element)
 360         else:
 361             return cmd('em')(self, element)
 362
 363     def handle_obraz(self, element):
 364         frmt = self.options['format']
 365         name = element.attrib['nazwa'].strip()
 366         image = frmt.get_image(name.strip())
 367         img_path = "obraz/%s" % name.replace("_", "")
 368         frmt.attachments[img_path] = image
 369         return cmd("obraz", parms=[img_path])(self)
 370
 371     def handle_video(self, element):
 372         url = element.attrib.get('url')
 373         if not url:
 374             print '!! <video> missing url'
 375             return
 376         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 377         if not m:
 378             print '!! unknown <video> url scheme:', url
 379             return
 380         name = m.group(1)
 381         thumb = IOFile.from_string(urlopen
 382             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 383         img_path = "video/%s.jpg" % name.replace("_", "")
 384         self.options['format'].attachments[img_path] = thumb
 385         canon_url = "https://www.youtube.com/watch?v=%s" % name
 386         return cmd("video", parms=[img_path, canon_url])(self)
 387
 388
 389 class Exercise(EduModule):
 390     def __init__(self, *args, **kw):
 391         self.question_counter = 0
 392         super(Exercise, self).__init__(*args, **kw)
 393
 394     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 395
 396     def handle_cwiczenie(self, element):
 397         self.options = {
 398             'exercise': element.attrib['typ'],
 399             'sub_gen': True,
 400         }
 401         self.question_counter = 0
 402         self.piece_counter = 0
 403
 404         header = etree.Element("parm")
 405         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 406         header_cmd.append(header)
 407         header.text = u"Zadanie %d." % self.options['exercise_counter']
 408
 409         pre = etree.tostring(header_cmd, encoding=unicode)
 410         post = u""
 411         # Add a single <pytanie> tag if it's not there
 412         if not element.xpath(".//pytanie"):
 413             qpre, qpost = self.handle_pytanie(element)
 414             pre = pre + qpre
 415             post = qpost + post
 416         return pre, post
 417
 418     def handle_pytanie(self, element):
 419         """This will handle <cwiczenie> element, when there is no <pytanie>
 420         """
 421         self.question_counter += 1
 422         self.piece_counter = 0
 423         pre = post = u""
 424         if self.options['teacher'] and element.attrib.get('rozw'):
 425             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 426         return pre, post
 427
 428     def handle_punkt(self, element):
 429         pre, post = super(Exercise, self).handle_punkt(element)
 430         if self.options['teacher'] and element.attrib.get('rozw'):
 431             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 432         return pre, post
 433
 434     def solution_header(self):
 435         par = etree.Element("cmd", name="par")
 436         parm = etree.Element("parm")
 437         parm.text = u"Rozwiązanie:"
 438         par.append(parm)
 439         return etree.tostring(par)
 440
 441     def explicit_solution(self):
 442         if self.options['solution']:
 443             par = etree.Element("cmd", name="par")
 444             parm = etree.Element("parm")
 445             parm.text = self.options['solution']
 446             par.append(parm)
 447             return self.solution_header() + etree.tostring(par)
 448
 449
 450
 451 class Wybor(Exercise):
 452     def handle_cwiczenie(self, element):
 453         pre, post = super(Wybor, self).handle_cwiczenie(element)
 454         is_single_choice = True
 455         pytania = element.xpath(".//pytanie")
 456         if not pytania:
 457             pytania = [element]
 458         for p in pytania:
 459             solutions = re.split(r"[, ]+", p.attrib['rozw'])
 460             if len(solutions) != 1:
 461                 is_single_choice = False
 462                 break
 463             choices = p.xpath(".//*[@nazwa]")
 464             uniq = set()
 465             for n in choices: uniq.add(n.attrib['nazwa'])
 466             if len(choices) != len(uniq):
 467                 is_single_choice = False
 468                 break
 469
 470         self.options = {'single': is_single_choice}
 471         return pre, post
 472
 473     def handle_punkt(self, element):
 474         if self.options['exercise'] and element.attrib.get('nazwa', None):
 475             cmd = 'radio' if self.options['single'] else 'checkbox'
 476             return u'<cmd name="%s"/>' % cmd, ''
 477         else:
 478             return super(Wybor, self).handle_punkt(element)
 479
 480
 481 class Uporzadkuj(Exercise):
 482     def handle_pytanie(self, element):
 483         order_items = element.xpath(".//punkt/@rozw")
 484         return super(Uporzadkuj, self).handle_pytanie(element)
 485
 486
 487 class Przyporzadkuj(Exercise):
 488     def handle_lista(self, lista):
 489         header = etree.Element("parm")
 490         header_cmd = etree.Element("cmd", name="par")
 491         header_cmd.append(header)
 492         if 'nazwa' in lista.attrib:
 493             header.text = u"Kategorie:"
 494         elif 'cel' in lista.attrib:
 495             header.text = u"Elementy do przyporządkowania:"
 496         else:
 497             header.text = u"Lista:"
 498         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 499         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 500         return pre, post
 501
 502
 503 class Luki(Exercise):
 504     def find_pieces(self, question):
 505         return question.xpath(".//luka")
 506
 507     def solution(self, piece):
 508         piece = deepcopy(piece)
 509         piece.tail = None
 510         sub = EduModule()
 511         return sub.generate(piece)
 512
 513     def handle_pytanie(self, element):
 514         qpre, qpost = super(Luki, self).handle_pytanie(element)
 515
 516         luki = self.find_pieces(element)
 517         random.shuffle(luki)
 518         self.words = u"<env name='itemize'>%s</env>" % (
 519             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 520         )
 521         return qpre, qpost
 522
 523     def handle_opis(self, element):
 524         return '', self.words
 525
 526     def handle_luka(self, element):
 527         luka = "_" * 10
 528         if self.options['teacher']:
 529             piece = deepcopy(element)
 530             piece.tail = None
 531             sub = EduModule()
 532             text = sub.generate(piece)
 533             luka += u" [rozwiązanie: %s]" % text
 534         return luka
 535
 536
 537 class Zastap(Luki):
 538     def find_pieces(self, question):
 539         return question.xpath(".//zastap")
 540
 541     def solution(self, piece):
 542         return piece.attrib['rozw']
 543
 544     def list_header(self):
 545         return u"Elementy do wstawienia"
 546
 547     def handle_zastap(self, element):
 548         piece = deepcopy(element)
 549         piece.tail = None
 550         sub = EduModule()
 551         text = sub.generate(piece)
 552         if self.options['teacher'] and element.attrib.get('rozw'):
 553             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 554         return text
 555
 556
 557 class PrawdaFalsz(Exercise):
 558     def handle_punkt(self, element):
 559         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 560         if 'rozw' in element.attrib:
 561             post += u" [Prawda/Fałsz]"
 562         return pre, post
 563
 564
 565
 566 def fix_lists(tree):
 567     lists = tree.xpath(".//lista")
 568     for l in lists:
 569         if l.text:
 570             p = l.getprevious()
 571             if p is not None:
 572                 if p.tail is None: p.tail = ''
 573                 p.tail += l.text
 574             else:
 575                 p = l.getparent()
 576                 if p.text is None: p.text = ''
 577                 p.text += l.text
 578             l.text = ''
 579     return tree
 580
 581
 582 class EduModulePDFFormat(PDFFormat):
 583     def get_texml(self):
 584         self.attachments = {}
 585         edumod = EduModule({
 586             "format": self,
 587             "teacher": self.customization.get('teacher'),
 588         })
 589         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 590
 591         open("/tmp/texml.xml", "w").write(texml)
 592         return texml
 593
 594     def get_tex_dir(self):
 595         temp = super(EduModulePDFFormat, self).get_tex_dir()
 596         for name, iofile in self.attachments.items():
 597             iofile.save_as(os.path.join(temp, name))
 598         return temp
 599
 600     def get_image(self, name):
 601         return self.wldoc.source.attachments[name]
 602