librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
Creates one big XML document from the book and its children, converts it to
LaTeX with TeXML, then processes the result with XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 from copy import deepcopy
  14 import os
  15 import os.path
  16 import shutil
  17 from StringIO import StringIO
  18 from tempfile import mkdtemp, NamedTemporaryFile
  19 import re
  20 import random
  21 from copy import deepcopy
  22 from subprocess import call, PIPE
  23 from urllib2 import urlopen
  24
  25 from Texml.processor import process
  26 from lxml import etree
  27 from lxml.etree import XMLSyntaxError, XSLTApplyError
  28
  29 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  30 from librarian.dcparser import Person
  31 from librarian.parser import WLDocument
  32 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  33 from librarian import functions
  34 from pdf import PDFFormat
  35
  36
  37
  38 def escape(really):
  39     def deco(f):
  40         def _wrap(*args, **kw):
  41             value = f(*args, **kw)
  42
  43             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  44             postfix = u'</TeXML>'
  45             if isinstance(value, list):
  46                 import pdb; pdb.set_trace()
  47             if isinstance(value, tuple):
  48                 return prefix + value[0], value[1] + postfix
  49             else:
  50                 return prefix + value + postfix
  51         return _wrap
  52     return deco
  53
  54
  55 def cmd(name, parms=None):
  56     def wrap(self, element=None):
  57         pre, post = tag_open_close('cmd', name=name)
  58
  59         if parms:
  60             for parm in parms:
  61                 e = etree.Element("parm")
  62                 e.text = parm
  63                 pre += etree.tostring(e)
  64         if element is not None:
  65             pre += "<parm>"
  66             post = "</parm>" + post
  67             return pre, post
  68         else:
  69             return pre + post
  70     return wrap
  71
  72
  73 def mark_alien_characters(text):
  74     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  75     return text
  76
  77
  78 class EduModule(Xmill):
  79     def __init__(self, options=None):
  80         super(EduModule, self).__init__(options)
  81         self.activity_counter = 0
  82         self.exercise_counter = 0
  83
  84         def swap_endlines(txt):
  85             if self.options['strofa']:
  86                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  87             return txt
  88         self.register_text_filter(functions.substitute_entities)
  89         self.register_text_filter(mark_alien_characters)
  90         self.register_text_filter(swap_endlines)
  91
  92     def get_dc(self, element, dc_field, single=False):
  93         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  94         if single:
  95             return values[0]
  96         return values
  97
  98     def handle_rdf__RDF(self, _):
  99         "skip metadata in generation"
 100         return
 101
 102     @escape(True)
 103     def get_rightsinfo(self, element):
 104         rights_lic = self.get_dc(element, 'rights.license', True)
 105         return u'<cmd name="rightsinfostr">' + \
 106           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
 107           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
 108           u'</cmd>'
 109
 110     @escape(True)
 111     def get_authors(self, element, which=None):
 112         dc = self.options['wldoc'].book_info
 113         if which is None:
 114             authors = dc.authors_textbook + \
 115                 dc.authors_scenario + \
 116                 dc.authors_expert
 117         else:
 118             authors = getattr(dc, "authors_%s" % which)
 119         return u', '.join(author.readable() for author in authors)
 120
 121     @escape(1)
 122     def get_title(self, element):
 123         return self.get_dc(element, 'title', True)
 124
 125     def handle_utwor(self, element):
 126         lines = [
 127             u'''
 128     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 129         <TeXML escape="0">
 130         \\documentclass[%s]{wl}
 131         \\usepackage{style}''' % self.options['customization_str'],
 132     self.options['has_cover'] and '\usepackage{makecover}',
 133     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 134     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 135     (self.options['morefloats'] == 'none' and
 136      u'''\\IfFileExists{morefloats.sty}{
 137             \\usepackage{morefloats}
 138         }{}'''),
 139     u'''\\def\\authors{%s}''' % self.get_authors(element),
 140     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 141     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 142     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 143
 144     u'''\\author{\\authors}''',
 145     u'''\\title{%s}''' % self.get_title(element),
 146     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 147     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 148     u'</TeXML>']
 149
 150         return u"".join(filter(None, lines)), u'</TeXML>'
 151
 152
 153     @escape(1)
 154     def handle_powiesc(self, element):
 155         return u"""
 156     <env name="document">
 157     <cmd name="maketitle"/>
 158     """, """<cmd name="editorialsection" /></env>"""
 159
 160     @escape(1)
 161     def handle_texcommand(self, element):
 162         cmd = functions.texcommand(element.tag)
 163         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 164
 165     handle_akap = \
 166     handle_akap = \
 167     handle_akap_cd = \
 168     handle_akap_cd = \
 169     handle_akap_dialog = \
 170     handle_akap_dialog = \
 171     handle_autor_utworu = \
 172     handle_dedykacja = \
 173     handle_didaskalia = \
 174     handle_didask_tekst = \
 175     handle_dlugi_cytat = \
 176     handle_dzielo_nadrzedne = \
 177     handle_lista_osoba = \
 178     handle_mat = \
 179     handle_miejsce_czas = \
 180     handle_motto = \
 181     handle_motto_podpis = \
 182     handle_naglowek_akt = \
 183     handle_naglowek_czesc = \
 184     handle_naglowek_listy = \
 185     handle_naglowek_osoba = \
 186     handle_naglowek_podrozdzial = \
 187     handle_naglowek_podrozdzial = \
 188     handle_naglowek_rozdzial = \
 189     handle_naglowek_rozdzial = \
 190     handle_naglowek_scena = \
 191     handle_nazwa_utworu = \
 192     handle_nota = \
 193     handle_osoba = \
 194     handle_pa = \
 195     handle_pe = \
 196     handle_podtytul = \
 197     handle_poezja_cyt = \
 198     handle_pr = \
 199     handle_pt = \
 200     handle_sekcja_asterysk = \
 201     handle_sekcja_swiatlo = \
 202     handle_separator_linia = \
 203     handle_slowo_obce = \
 204     handle_srodtytul = \
 205     handle_tytul_dziela = \
 206     handle_wyroznienie = \
 207     handle_texcommand
 208
 209     _handle_strofa = cmd("strofa")
 210
 211     def handle_strofa(self, element):
 212         self.options = {'strofa': True}
 213         return self._handle_strofa(element)
 214
 215     def handle_aktywnosc(self, element):
 216         self.activity_counter += 1
 217         self.options = {
 218             'activity': True,
 219             'activity_counter': self.activity_counter,
 220             'sub_gen': True,
 221         }
 222         submill = EduModule(self.options)
 223
 224         opis = submill.generate(element.xpath('opis')[0])
 225
 226         n = element.xpath('wskazowki')
 227         if n: wskazowki = submill.generate(n[0])
 228
 229         else: wskazowki = ''
 230         n = element.xpath('pomoce')
 231
 232         if n: pomoce = submill.generate(n[0])
 233         else: pomoce = ''
 234
 235         forma = ''.join(element.xpath('forma/text()'))
 236
 237         czas = ''.join(element.xpath('czas/text()'))
 238
 239         counter = self.activity_counter
 240
 241         return u"""
 242 <cmd name="noindent" />
 243 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 244 <cmd name="activityinfo"><parm>
 245  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 246  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 247  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 248 </parm></cmd>
 249
 250
 251 %(opis)s
 252
 253 %(wskazowki)s
 254 """ % locals()
 255
 256     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 257     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 258
 259     @ifoption(sub_gen=True)
 260     def handle_pomoce(self, _):
 261         return "Pomoce: ", ""
 262
 263     def handle_czas(self, *_):
 264         return
 265
 266     def handle_forma(self, *_):
 267         return
 268
 269     def handle_lista(self, element, attrs={}):
 270         if not element.findall("punkt"):
 271             return None
 272         ltype = element.attrib.get('typ', 'punkt')
 273         if ltype == 'slowniczek':
 274             surl = element.attrib.get('src', None)
 275             if surl is None:
 276                 # print '** missing src on <slowniczek>, setting default'
 277                 surl = 'http://edukacjamedialna.edu.pl/slowniczek'
 278             sxml = None
 279             if surl:
 280                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 281             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 282
 283         listcmd = {'num': 'enumerate',
 284                'punkt': 'itemize',
 285                'alfa': 'itemize',
 286                'slowniczek': 'itemize',
 287                'czytelnia': 'itemize'}[ltype]
 288
 289         return u'<env name="%s">' % listcmd, u'</env>'
 290
 291     def handle_punkt(self, element):
 292         return '<cmd name="item"/>', ''
 293
 294     def handle_cwiczenie(self, element):
 295         exercise_handlers = {
 296             'wybor': Wybor,
 297             'uporzadkuj': Uporzadkuj,
 298             'luki': Luki,
 299             'zastap': Zastap,
 300             'przyporzadkuj': Przyporzadkuj,
 301             'prawdafalsz': PrawdaFalsz
 302         }
 303
 304         typ = element.attrib['typ']
 305         self.exercise_counter += 1
 306         if not typ in exercise_handlers:
 307             return '(no handler)'
 308         self.options = {'exercise_counter': self.exercise_counter}
 309         handler = exercise_handlers[typ](self.options)
 310         return handler.generate(element)
 311
 312     # XXX this is copied from pyhtml.py, except for return and
 313     # should be refactored for no code duplication
 314     def handle_definiendum(self, element):
 315         nxt = element.getnext()
 316         definiens_s = ''
 317
 318         # let's pull definiens from another document
 319         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 320             sxml = self.options['slowniczek_xml']
 321             assert element.text != ''
 322             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 323             if defloc:
 324                 definiens = defloc[0].getnext()
 325                 if definiens.tag == 'definiens':
 326                     subgen = EduModule(self.options)
 327                     definiens_s = subgen.generate(definiens)
 328
 329         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 330
 331     def handle_definiens(self, element):
 332         return u"", u""
 333
 334     def handle_podpis(self, element):
 335         return u"""<env name="figure">""", u"</env>"
 336
 337     def handle_tabela(self, element):
 338         max_col = 0
 339         for w in element.xpath("wiersz"):
 340             ks = w.xpath("kol")
 341             if max_col < len(ks):
 342                 max_col = len(ks)
 343         self.options = {'columnts': max_col}
 344         # styling:
 345                 #        has_frames = int(element.attrib.get("ramki", "0"))
 346                 #        if has_frames: frames_c = "framed"
 347                 #        else: frames_c = ""
 348                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 349         return u'''
 350 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 351     ''' % ('l' * max_col), \
 352     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 353
 354     @escape(1)
 355     def handle_wiersz(self, element):
 356         return u"", u'<ctrl ch="\\"/>'
 357
 358     @escape(1)
 359     def handle_kol(self, element):
 360         if element.getnext() is not None:
 361             return u"", u'<spec cat="align" />'
 362         return u"", u""
 363
 364     def handle_link(self, element):
 365         if element.attrib.get('url'):
 366             url = element.attrib.get('url')
 367             if url == element.text:
 368                 return cmd('url')(self, element)
 369             else:
 370                 return cmd('href', parms=[element.attrib['url']])(self, element)
 371         else:
 372             return cmd('emph')(self, element)
 373
 374     def handle_obraz(self, element):
 375         frmt = self.options['format']
 376         name = element.attrib['nazwa'].strip()
 377         image = frmt.get_image(name.strip())
 378         img_path = "obraz/%s" % name.replace("_", "")
 379         frmt.attachments[img_path] = image
 380         return cmd("obraz", parms=[img_path])(self)
 381
 382     def handle_video(self, element):
 383         url = element.attrib.get('url')
 384         if not url:
 385             print '!! <video> missing url'
 386             return
 387         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 388         if not m:
 389             print '!! unknown <video> url scheme:', url
 390             return
 391         name = m.group(1)
 392         thumb = IOFile.from_string(urlopen
 393             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 394         img_path = "video/%s.jpg" % name.replace("_", "")
 395         self.options['format'].attachments[img_path] = thumb
 396         canon_url = "https://www.youtube.com/watch?v=%s" % name
 397         return cmd("video", parms=[img_path, canon_url])(self)
 398
 399
 400 class Exercise(EduModule):
 401     def __init__(self, *args, **kw):
 402         self.question_counter = 0
 403         super(Exercise, self).__init__(*args, **kw)
 404
 405     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 406
 407     def handle_cwiczenie(self, element):
 408         self.options = {
 409             'exercise': element.attrib['typ'],
 410             'sub_gen': True,
 411         }
 412         self.question_counter = 0
 413         self.piece_counter = 0
 414
 415         header = etree.Element("parm")
 416         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 417         header_cmd.append(header)
 418         header.text = u"Zadanie %d." % self.options['exercise_counter']
 419
 420         pre = etree.tostring(header_cmd, encoding=unicode)
 421         post = u""
 422         # Add a single <pytanie> tag if it's not there
 423         if not element.xpath(".//pytanie"):
 424             qpre, qpost = self.handle_pytanie(element)
 425             pre = pre + qpre
 426             post = qpost + post
 427         return pre, post
 428
 429     def handle_pytanie(self, element):
 430         """This will handle <cwiczenie> element, when there is no <pytanie>
 431         """
 432         self.question_counter += 1
 433         self.piece_counter = 0
 434         pre = post = u""
 435         if self.options['teacher'] and element.attrib.get('rozw'):
 436             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 437         return pre, post
 438
 439     def handle_punkt(self, element):
 440         pre, post = super(Exercise, self).handle_punkt(element)
 441         if self.options['teacher'] and element.attrib.get('rozw'):
 442             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 443         return pre, post
 444
 445     def solution_header(self):
 446         par = etree.Element("cmd", name="par")
 447         parm = etree.Element("parm")
 448         parm.text = u"Rozwiązanie:"
 449         par.append(parm)
 450         return etree.tostring(par)
 451
 452     def explicit_solution(self):
 453         if self.options['solution']:
 454             par = etree.Element("cmd", name="par")
 455             parm = etree.Element("parm")
 456             parm.text = self.options['solution']
 457             par.append(parm)
 458             return self.solution_header() + etree.tostring(par)
 459
 460
 461
 462 class Wybor(Exercise):
 463     def handle_cwiczenie(self, element):
 464         pre, post = super(Wybor, self).handle_cwiczenie(element)
 465         is_single_choice = True
 466         pytania = element.xpath(".//pytanie")
 467         if not pytania:
 468             pytania = [element]
 469         for p in pytania:
 470             solutions = re.split(r"[, ]+", p.attrib['rozw'])
 471             if len(solutions) != 1:
 472                 is_single_choice = False
 473                 break
 474             choices = p.xpath(".//*[@nazwa]")
 475             uniq = set()
 476             for n in choices: uniq.add(n.attrib['nazwa'])
 477             if len(choices) != len(uniq):
 478                 is_single_choice = False
 479                 break
 480
 481         self.options = {'single': is_single_choice}
 482         return pre, post
 483
 484     def handle_punkt(self, element):
 485         if self.options['exercise'] and element.attrib.get('nazwa', None):
 486             cmd = 'radio' if self.options['single'] else 'checkbox'
 487             return u'<cmd name="%s"/>' % cmd, ''
 488         else:
 489             return super(Wybor, self).handle_punkt(element)
 490
 491
 492 class Uporzadkuj(Exercise):
 493     def handle_pytanie(self, element):
 494         order_items = element.xpath(".//punkt/@rozw")
 495         return super(Uporzadkuj, self).handle_pytanie(element)
 496
 497
 498 class Przyporzadkuj(Exercise):
 499     def handle_lista(self, lista):
 500         header = etree.Element("parm")
 501         header_cmd = etree.Element("cmd", name="par")
 502         header_cmd.append(header)
 503         if 'nazwa' in lista.attrib:
 504             header.text = u"Kategorie:"
 505         elif 'cel' in lista.attrib:
 506             header.text = u"Elementy do przyporządkowania:"
 507         else:
 508             header.text = u"Lista:"
 509         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 510         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 511         return pre, post
 512
 513
 514 class Luki(Exercise):
 515     def find_pieces(self, question):
 516         return question.xpath(".//luka")
 517
 518     def solution(self, piece):
 519         piece = deepcopy(piece)
 520         piece.tail = None
 521         sub = EduModule()
 522         return sub.generate(piece)
 523
 524     def handle_pytanie(self, element):
 525         qpre, qpost = super(Luki, self).handle_pytanie(element)
 526
 527         luki = self.find_pieces(element)
 528         random.shuffle(luki)
 529         self.words = u"<env name='itemize'>%s</env>" % (
 530             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 531         )
 532         return qpre, qpost
 533
 534     def handle_opis(self, element):
 535         return '', self.words
 536
 537     def handle_luka(self, element):
 538         luka = "_" * 10
 539         if self.options['teacher']:
 540             piece = deepcopy(element)
 541             piece.tail = None
 542             sub = EduModule()
 543             text = sub.generate(piece)
 544             luka += u" [rozwiązanie: %s]" % text
 545         return luka
 546
 547
 548 class Zastap(Luki):
 549     def find_pieces(self, question):
 550         return question.xpath(".//zastap")
 551
 552     def solution(self, piece):
 553         return piece.attrib['rozw']
 554
 555     def list_header(self):
 556         return u"Elementy do wstawienia"
 557
 558     def handle_zastap(self, element):
 559         piece = deepcopy(element)
 560         piece.tail = None
 561         sub = EduModule()
 562         text = sub.generate(piece)
 563         if self.options['teacher'] and element.attrib.get('rozw'):
 564             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 565         return text
 566
 567
 568 class PrawdaFalsz(Exercise):
 569     def handle_punkt(self, element):
 570         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 571         if 'rozw' in element.attrib:
 572             post += u" [Prawda/Fałsz]"
 573         return pre, post
 574
 575
 576
 577 def fix_lists(tree):
 578     lists = tree.xpath(".//lista")
 579     for l in lists:
 580         if l.text:
 581             p = l.getprevious()
 582             if p is not None:
 583                 if p.tail is None: p.tail = ''
 584                 p.tail += l.text
 585             else:
 586                 p = l.getparent()
 587                 if p.text is None: p.text = ''
 588                 p.text += l.text
 589             l.text = ''
 590     return tree
 591
 592
 593 class EduModulePDFFormat(PDFFormat):
 594     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 595
 596     def get_texml(self):
 597         self.attachments = {}
 598         edumod = EduModule({
 599             "wldoc": self.wldoc,
 600             "format": self,
 601             "teacher": self.customization.get('teacher'),
 602         })
 603         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 604
 605         open("/tmp/texml.xml", "w").write(texml)
 606         return texml
 607
 608     def get_tex_dir(self):
 609         temp = super(EduModulePDFFormat, self).get_tex_dir()
 610         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 611         for name, iofile in self.attachments.items():
 612             iofile.save_as(os.path.join(temp, name))
 613         return temp
 614
 615     def get_image(self, name):
 616         return self.wldoc.source.attachments[name]
 617