librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 from copy import deepcopy
  14 import os
  15 import os.path
  16 import shutil
  17 from StringIO import StringIO
  18 from tempfile import mkdtemp, NamedTemporaryFile
  19 import re
  20 import random
  21 from copy import deepcopy
  22 from subprocess import call, PIPE
  23 from urllib2 import urlopen
  24
  25 from Texml.processor import process
  26 from lxml import etree
  27 from lxml.etree import XMLSyntaxError, XSLTApplyError
  28
  29 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  30 from librarian.dcparser import Person
  31 from librarian.parser import WLDocument
  32 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  33 from librarian import functions
  34 from pdf import PDFFormat
  35
  36
  37
  38 def escape(really):
  39     def deco(f):
  40         def _wrap(*args, **kw):
  41             value = f(*args, **kw)
  42
  43             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  44             postfix = u'</TeXML>'
  45             if isinstance(value, list):
  46                 import pdb; pdb.set_trace()
  47             if isinstance(value, tuple):
  48                 return prefix + value[0], value[1] + postfix
  49             else:
  50                 return prefix + value + postfix
  51         return _wrap
  52     return deco
  53
  54
  55 def cmd(name, parms=None):
  56     def wrap(self, element=None):
  57         pre, post = tag_open_close('cmd', name=name)
  58
  59         if parms:
  60             for parm in parms:
  61                 e = etree.Element("parm")
  62                 e.text = parm
  63                 pre += etree.tostring(e)
  64         if element is not None:
  65             pre += "<parm>"
  66             post = "</parm>" + post
  67             return pre, post
  68         else:
  69             return pre + post
  70     return wrap
  71
  72
  73 def mark_alien_characters(text):
  74     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  75     return text
  76
  77
  78 class EduModule(Xmill):
  79     def __init__(self, options=None):
  80         super(EduModule, self).__init__(options)
  81         self.activity_counter = 0
  82         self.exercise_counter = 0
  83
  84         def swap_endlines(txt):
  85             if self.options['strofa']:
  86                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  87             return txt
  88         self.register_text_filter(functions.substitute_entities)
  89         self.register_text_filter(mark_alien_characters)
  90         self.register_text_filter(swap_endlines)
  91
  92     def get_dc(self, element, dc_field, single=False):
  93         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  94         if single:
  95             return values[0]
  96         return values
  97
  98     def handle_rdf__RDF(self, _):
  99         "skip metadata in generation"
 100         return
 101
 102     @escape(True)
 103     def get_rightsinfo(self, element):
 104         rights_lic = self.get_dc(element, 'rights.license', True)
 105         return u'<cmd name="rightsinfostr">' + \
 106           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
 107           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
 108           u'</cmd>'
 109
 110     @escape(True)
 111     def get_authors(self, element):
 112         authors = self.get_dc(element, 'creator.expert') + \
 113           self.get_dc(element, 'creator.scenario') + \
 114           self.get_dc(element, 'creator.textbook')
 115         return u', '.join(authors)
 116
 117     @escape(1)
 118     def get_title(self, element):
 119         return self.get_dc(element, 'title', True)
 120
 121     def handle_utwor(self, element):
 122         lines = [
 123             u'''
 124     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 125         <TeXML escape="0">
 126         \\documentclass[%s]{wl}
 127         \\usepackage{style}''' % self.options['customization_str'],
 128     self.options['has_cover'] and '\usepackage{makecover}',
 129     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 130     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 131     (self.options['morefloats'] == 'none' and
 132      u'''\\IfFileExists{morefloats.sty}{
 133             \\usepackage{morefloats}
 134         }{}'''),
 135     u'''\\def\\authors{%s}''' % self.get_authors(element),
 136     u'''\\author{\\authors}''',
 137     u'''\\title{%s}''' % self.get_title(element),
 138     u'''\\def\\bookurl{%s}''' % self.get_dc(element, 'identifier.url', True),
 139     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 140     u'</TeXML>']
 141
 142         return u"".join(filter(None, lines)), u'</TeXML>'
 143
 144
 145     @escape(1)
 146     def handle_powiesc(self, element):
 147         return u"""
 148     <env name="document">
 149     <cmd name="maketitle"/>
 150     """, """</env>"""
 151
 152     @escape(1)
 153     def handle_texcommand(self, element):
 154         cmd = functions.texcommand(element.tag)
 155         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 156
 157     handle_akap = \
 158     handle_akap = \
 159     handle_akap_cd = \
 160     handle_akap_cd = \
 161     handle_akap_dialog = \
 162     handle_akap_dialog = \
 163     handle_autor_utworu = \
 164     handle_dedykacja = \
 165     handle_didaskalia = \
 166     handle_didask_tekst = \
 167     handle_dlugi_cytat = \
 168     handle_dzielo_nadrzedne = \
 169     handle_lista_osoba = \
 170     handle_mat = \
 171     handle_miejsce_czas = \
 172     handle_motto = \
 173     handle_motto_podpis = \
 174     handle_naglowek_akt = \
 175     handle_naglowek_czesc = \
 176     handle_naglowek_listy = \
 177     handle_naglowek_osoba = \
 178     handle_naglowek_podrozdzial = \
 179     handle_naglowek_podrozdzial = \
 180     handle_naglowek_rozdzial = \
 181     handle_naglowek_rozdzial = \
 182     handle_naglowek_scena = \
 183     handle_nazwa_utworu = \
 184     handle_nota = \
 185     handle_osoba = \
 186     handle_pa = \
 187     handle_pe = \
 188     handle_podtytul = \
 189     handle_poezja_cyt = \
 190     handle_pr = \
 191     handle_pt = \
 192     handle_sekcja_asterysk = \
 193     handle_sekcja_swiatlo = \
 194     handle_separator_linia = \
 195     handle_slowo_obce = \
 196     handle_srodtytul = \
 197     handle_tytul_dziela = \
 198     handle_wyroznienie = \
 199     handle_texcommand
 200
 201     _handle_strofa = cmd("strofa")
 202
 203     def handle_strofa(self, element):
 204         self.options = {'strofa': True}
 205         return self._handle_strofa(element)
 206
 207     def handle_aktywnosc(self, element):
 208         self.activity_counter += 1
 209         self.options = {
 210             'activity': True,
 211             'activity_counter': self.activity_counter,
 212             'sub_gen': True,
 213         }
 214         submill = EduModule(self.options)
 215
 216         opis = submill.generate(element.xpath('opis')[0])
 217
 218         n = element.xpath('wskazowki')
 219         if n: wskazowki = submill.generate(n[0])
 220
 221         else: wskazowki = ''
 222         n = element.xpath('pomoce')
 223
 224         if n: pomoce = submill.generate(n[0])
 225         else: pomoce = ''
 226
 227         forma = ''.join(element.xpath('forma/text()'))
 228
 229         czas = ''.join(element.xpath('czas/text()'))
 230
 231         counter = self.activity_counter
 232
 233         return u"""
 234
 235 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 236 <cmd name="activityinfo"><parm>
 237  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 238  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 239  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 240 </parm></cmd>
 241
 242
 243 %(opis)s
 244
 245 %(wskazowki)s
 246 """ % locals()
 247
 248     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 249     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 250
 251     @ifoption(sub_gen=True)
 252     def handle_pomoce(self, _):
 253         return "Pomoce: ", ""
 254
 255     def handle_czas(self, *_):
 256         return
 257
 258     def handle_forma(self, *_):
 259         return
 260
 261     def handle_lista(self, element, attrs={}):
 262         if not element.findall("punkt"):
 263             return None
 264         ltype = element.attrib.get('typ', 'punkt')
 265         if ltype == 'slowniczek':
 266             surl = element.attrib.get('src', None)
 267             if surl is None:
 268                 # print '** missing src on <slowniczek>, setting default'
 269                 surl = 'http://edukacjamedialna.edu.pl/slowniczek'
 270             sxml = None
 271             if surl:
 272                 sxml = etree.fromstring(self.options['provider'].by_uri(surl).get_string())
 273             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 274
 275         listcmd = {'num': 'enumerate',
 276                'punkt': 'itemize',
 277                'alfa': 'itemize',
 278                'slowniczek': 'itemize',
 279                'czytelnia': 'itemize'}[ltype]
 280
 281         return u'<env name="%s">' % listcmd, u'</env>'
 282
 283     def handle_punkt(self, element):
 284         return '<cmd name="item"/>', ''
 285
 286     def handle_cwiczenie(self, element):
 287         exercise_handlers = {
 288             'wybor': Wybor,
 289             'uporzadkuj': Uporzadkuj,
 290             'luki': Luki,
 291             'zastap': Zastap,
 292             'przyporzadkuj': Przyporzadkuj,
 293             'prawdafalsz': PrawdaFalsz
 294         }
 295
 296         typ = element.attrib['typ']
 297         self.exercise_counter += 1
 298         if not typ in exercise_handlers:
 299             return '(no handler)'
 300         self.options = {'exercise_counter': self.exercise_counter}
 301         handler = exercise_handlers[typ](self.options)
 302         return handler.generate(element)
 303
 304     # XXX this is copied from pyhtml.py, except for return and
 305     # should be refactored for no code duplication
 306     def handle_definiendum(self, element):
 307         nxt = element.getnext()
 308         definiens_s = ''
 309
 310         # let's pull definiens from another document
 311         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 312             sxml = self.options['slowniczek_xml']
 313             assert element.text != ''
 314             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 315             if defloc:
 316                 definiens = defloc[0].getnext()
 317                 if definiens.tag == 'definiens':
 318                     subgen = EduModule(self.options)
 319                     definiens_s = subgen.generate(definiens)
 320
 321         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 322
 323     def handle_definiens(self, element):
 324         return u"", u""
 325
 326     def handle_podpis(self, element):
 327         return u"""<env name="figure">""", u"</env>"
 328
 329     def handle_tabela(self, element):
 330         max_col = 0
 331         for w in element.xpath("wiersz"):
 332             ks = w.xpath("kol")
 333             if max_col < len(ks):
 334                 max_col = len(ks)
 335         self.options = {'columnts': max_col}
 336         # styling:
 337                 #        has_frames = int(element.attrib.get("ramki", "0"))
 338                 #        if has_frames: frames_c = "framed"
 339                 #        else: frames_c = ""
 340                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 341         return u'''
 342 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 343     ''' % ('l' * max_col), \
 344     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 345
 346     @escape(1)
 347     def handle_wiersz(self, element):
 348         return u"", u'<ctrl ch="\\"/>'
 349
 350     @escape(1)
 351     def handle_kol(self, element):
 352         if element.getnext() is not None:
 353             return u"", u'<spec cat="align" />'
 354         return u"", u""
 355
 356     def handle_link(self, element):
 357         if element.attrib.get('url'):
 358             url = element.attrib.get('url')
 359             if url == element.text:
 360                 return cmd('url')(self, element)
 361             else:
 362                 return cmd('href', parms=[element.attrib['url']])(self, element)
 363         else:
 364             return cmd('em')(self, element)
 365
 366     def handle_obraz(self, element):
 367         frmt = self.options['format']
 368         name = element.attrib['nazwa'].strip()
 369         image = frmt.get_image(name.strip())
 370         img_path = "obraz/%s" % name.replace("_", "")
 371         frmt.attachments[img_path] = image
 372         return cmd("obraz", parms=[img_path])(self)
 373
 374     def handle_video(self, element):
 375         url = element.attrib.get('url')
 376         if not url:
 377             print '!! <video> missing url'
 378             return
 379         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 380         if not m:
 381             print '!! unknown <video> url scheme:', url
 382             return
 383         name = m.group(1)
 384         thumb = IOFile.from_string(urlopen
 385             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 386         img_path = "video/%s.jpg" % name.replace("_", "")
 387         self.options['format'].attachments[img_path] = thumb
 388         canon_url = "https://www.youtube.com/watch?v=%s" % name
 389         return cmd("video", parms=[img_path, canon_url])(self)
 390
 391
 392 class Exercise(EduModule):
 393     def __init__(self, *args, **kw):
 394         self.question_counter = 0
 395         super(Exercise, self).__init__(*args, **kw)
 396
 397     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 398
 399     def handle_cwiczenie(self, element):
 400         self.options = {
 401             'exercise': element.attrib['typ'],
 402             'sub_gen': True,
 403         }
 404         self.question_counter = 0
 405         self.piece_counter = 0
 406
 407         header = etree.Element("parm")
 408         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 409         header_cmd.append(header)
 410         header.text = u"Zadanie %d." % self.options['exercise_counter']
 411
 412         pre = etree.tostring(header_cmd, encoding=unicode)
 413         post = u""
 414         # Add a single <pytanie> tag if it's not there
 415         if not element.xpath(".//pytanie"):
 416             qpre, qpost = self.handle_pytanie(element)
 417             pre = pre + qpre
 418             post = qpost + post
 419         return pre, post
 420
 421     def handle_pytanie(self, element):
 422         """This will handle <cwiczenie> element, when there is no <pytanie>
 423         """
 424         self.question_counter += 1
 425         self.piece_counter = 0
 426         pre = post = u""
 427         if self.options['teacher'] and element.attrib.get('rozw'):
 428             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 429         return pre, post
 430
 431     def handle_punkt(self, element):
 432         pre, post = super(Exercise, self).handle_punkt(element)
 433         if self.options['teacher'] and element.attrib.get('rozw'):
 434             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 435         return pre, post
 436
 437     def solution_header(self):
 438         par = etree.Element("cmd", name="par")
 439         parm = etree.Element("parm")
 440         parm.text = u"Rozwiązanie:"
 441         par.append(parm)
 442         return etree.tostring(par)
 443
 444     def explicit_solution(self):
 445         if self.options['solution']:
 446             par = etree.Element("cmd", name="par")
 447             parm = etree.Element("parm")
 448             parm.text = self.options['solution']
 449             par.append(parm)
 450             return self.solution_header() + etree.tostring(par)
 451
 452
 453
 454 class Wybor(Exercise):
 455     def handle_cwiczenie(self, element):
 456         pre, post = super(Wybor, self).handle_cwiczenie(element)
 457         is_single_choice = True
 458         pytania = element.xpath(".//pytanie")
 459         if not pytania:
 460             pytania = [element]
 461         for p in pytania:
 462             solutions = re.split(r"[, ]+", p.attrib['rozw'])
 463             if len(solutions) != 1:
 464                 is_single_choice = False
 465                 break
 466             choices = p.xpath(".//*[@nazwa]")
 467             uniq = set()
 468             for n in choices: uniq.add(n.attrib['nazwa'])
 469             if len(choices) != len(uniq):
 470                 is_single_choice = False
 471                 break
 472
 473         self.options = {'single': is_single_choice}
 474         return pre, post
 475
 476     def handle_punkt(self, element):
 477         if self.options['exercise'] and element.attrib.get('nazwa', None):
 478             cmd = 'radio' if self.options['single'] else 'checkbox'
 479             return u'<cmd name="%s"/>' % cmd, ''
 480         else:
 481             return super(Wybor, self).handle_punkt(element)
 482
 483
 484 class Uporzadkuj(Exercise):
 485     def handle_pytanie(self, element):
 486         order_items = element.xpath(".//punkt/@rozw")
 487         return super(Uporzadkuj, self).handle_pytanie(element)
 488
 489
 490 class Przyporzadkuj(Exercise):
 491     def handle_lista(self, lista):
 492         header = etree.Element("parm")
 493         header_cmd = etree.Element("cmd", name="par")
 494         header_cmd.append(header)
 495         if 'nazwa' in lista.attrib:
 496             header.text = u"Kategorie:"
 497         elif 'cel' in lista.attrib:
 498             header.text = u"Elementy do przyporządkowania:"
 499         else:
 500             header.text = u"Lista:"
 501         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 502         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 503         return pre, post
 504
 505
 506 class Luki(Exercise):
 507     def find_pieces(self, question):
 508         return question.xpath(".//luka")
 509
 510     def solution(self, piece):
 511         piece = deepcopy(piece)
 512         piece.tail = None
 513         sub = EduModule()
 514         return sub.generate(piece)
 515
 516     def handle_pytanie(self, element):
 517         qpre, qpost = super(Luki, self).handle_pytanie(element)
 518
 519         luki = self.find_pieces(element)
 520         random.shuffle(luki)
 521         self.words = u"<env name='itemize'>%s</env>" % (
 522             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 523         )
 524         return qpre, qpost
 525
 526     def handle_opis(self, element):
 527         return '', self.words
 528
 529     def handle_luka(self, element):
 530         luka = "_" * 10
 531         if self.options['teacher']:
 532             piece = deepcopy(element)
 533             piece.tail = None
 534             sub = EduModule()
 535             text = sub.generate(piece)
 536             luka += u" [rozwiązanie: %s]" % text
 537         return luka
 538
 539
 540 class Zastap(Luki):
 541     def find_pieces(self, question):
 542         return question.xpath(".//zastap")
 543
 544     def solution(self, piece):
 545         return piece.attrib['rozw']
 546
 547     def list_header(self):
 548         return u"Elementy do wstawienia"
 549
 550     def handle_zastap(self, element):
 551         piece = deepcopy(element)
 552         piece.tail = None
 553         sub = EduModule()
 554         text = sub.generate(piece)
 555         if self.options['teacher'] and element.attrib.get('rozw'):
 556             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 557         return text
 558
 559
 560 class PrawdaFalsz(Exercise):
 561     def handle_punkt(self, element):
 562         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 563         if 'rozw' in element.attrib:
 564             post += u" [Prawda/Fałsz]"
 565         return pre, post
 566
 567
 568
 569 def fix_lists(tree):
 570     lists = tree.xpath(".//lista")
 571     for l in lists:
 572         if l.text:
 573             p = l.getprevious()
 574             if p is not None:
 575                 if p.tail is None: p.tail = ''
 576                 p.tail += l.text
 577             else:
 578                 p = l.getparent()
 579                 if p.text is None: p.text = ''
 580                 p.text += l.text
 581             l.text = ''
 582     return tree
 583
 584
 585 class EduModulePDFFormat(PDFFormat):
 586     def get_texml(self):
 587         self.attachments = {}
 588         edumod = EduModule({
 589             'provider': self.wldoc.provider,
 590             "format": self,
 591             "teacher": self.customization.get('teacher'),
 592         })
 593         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 594
 595         open("/tmp/texml.xml", "w").write(texml)
 596         return texml
 597
 598     def get_tex_dir(self):
 599         temp = super(EduModulePDFFormat, self).get_tex_dir()
 600         for name, iofile in self.attachments.items():
 601             iofile.save_as(os.path.join(temp, name))
 602         return temp
 603
 604     def get_image(self, name):
 605         return self.wldoc.source.attachments[name]
 606