librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None):
  70         super(EduModule, self).__init__(options)
  71         self.activity_counter = 0
  72         self.exercise_counter = 0
  73
  74         def swap_endlines(txt):
  75             if self.options['strofa']:
  76                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  77             return txt
  78         self.register_text_filter(swap_endlines)
  79         self.register_text_filter(functions.substitute_entities)
  80         self.register_text_filter(mark_alien_characters)
  81
  82     def get_dc(self, element, dc_field, single=False):
  83         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  84         if single:
  85             return values[0]
  86         return values
  87
  88     def handle_rdf__RDF(self, _):
  89         "skip metadata in generation"
  90         return
  91
  92     @escape(True)
  93     def get_rightsinfo(self, element):
  94         rights_lic = self.get_dc(element, 'rights.license', True)
  95         return u'<cmd name="rightsinfostr">' + \
  96           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  97           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  98           u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors)
 110
 111     @escape(1)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
 115     def handle_utwor(self, element):
 116         lines = [
 117             u'''
 118     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 119         <TeXML escape="0">
 120         \\documentclass[%s]{wl}
 121         \\usepackage{style}''' % self.options['customization_str'],
 122     self.options['has_cover'] and '\usepackage{makecover}',
 123     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 124     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 125     (self.options['morefloats'] == 'none' and
 126      u'''\\IfFileExists{morefloats.sty}{
 127             \\usepackage{morefloats}
 128         }{}'''),
 129     u'''\\def\\authors{%s}''' % self.get_authors(element),
 130     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 131     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 132     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 133
 134     u'''\\author{\\authors}''',
 135     u'''\\title{%s}''' % self.get_title(element),
 136     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 137     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 138     u'</TeXML>']
 139
 140         return u"".join(filter(None, lines)), u'</TeXML>'
 141
 142
 143     @escape(1)
 144     def handle_powiesc(self, element):
 145         return u"""
 146     <env name="document">
 147     <cmd name="maketitle"/>
 148     """, """<cmd name="editorialsection" /></env>"""
 149
 150     @escape(1)
 151     def handle_texcommand(self, element):
 152         cmd = functions.texcommand(element.tag)
 153         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 154
 155     handle_akap = \
 156     handle_akap = \
 157     handle_akap_cd = \
 158     handle_akap_cd = \
 159     handle_akap_dialog = \
 160     handle_akap_dialog = \
 161     handle_autor_utworu = \
 162     handle_dedykacja = \
 163     handle_didaskalia = \
 164     handle_didask_tekst = \
 165     handle_dlugi_cytat = \
 166     handle_dzielo_nadrzedne = \
 167     handle_lista_osoba = \
 168     handle_mat = \
 169     handle_miejsce_czas = \
 170     handle_motto = \
 171     handle_motto_podpis = \
 172     handle_naglowek_akt = \
 173     handle_naglowek_czesc = \
 174     handle_naglowek_listy = \
 175     handle_naglowek_osoba = \
 176     handle_naglowek_podrozdzial = \
 177     handle_naglowek_podrozdzial = \
 178     handle_naglowek_rozdzial = \
 179     handle_naglowek_rozdzial = \
 180     handle_naglowek_scena = \
 181     handle_nazwa_utworu = \
 182     handle_nota = \
 183     handle_osoba = \
 184     handle_pa = \
 185     handle_pe = \
 186     handle_podtytul = \
 187     handle_poezja_cyt = \
 188     handle_pr = \
 189     handle_pt = \
 190     handle_sekcja_asterysk = \
 191     handle_sekcja_swiatlo = \
 192     handle_separator_linia = \
 193     handle_slowo_obce = \
 194     handle_srodtytul = \
 195     handle_tytul_dziela = \
 196     handle_wyroznienie = \
 197     handle_dywiz = \
 198     handle_texcommand
 199
 200     def handle_uwaga(self, _e):
 201         return None
 202     def handle_extra(self, _e):
 203         return None
 204
 205     def handle_nbsp(self, _e):
 206         return '<spec cat="tilde" />'
 207
 208     _handle_strofa = cmd("strofa")
 209
 210     def handle_strofa(self, element):
 211         self.options = {'strofa': True}
 212         return self._handle_strofa(element)
 213
 214     def handle_aktywnosc(self, element):
 215         self.activity_counter += 1
 216         self.options = {
 217             'activity': True,
 218             'activity_counter': self.activity_counter,
 219             'sub_gen': True,
 220         }
 221         submill = EduModule(self.options)
 222
 223         if element.xpath('opis'):
 224             opis = submill.generate(element.xpath('opis')[0])
 225         else:
 226             opis = ''
 227
 228         n = element.xpath('wskazowki')
 229         if n: wskazowki = submill.generate(n[0])
 230
 231         else: wskazowki = ''
 232         n = element.xpath('pomoce')
 233
 234         if n: pomoce = submill.generate(n[0])
 235         else: pomoce = ''
 236
 237         forma = ''.join(element.xpath('forma/text()'))
 238
 239         czas = ''.join(element.xpath('czas/text()'))
 240
 241         counter = self.activity_counter
 242
 243         return u"""
 244 <cmd name="noindent" />
 245 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 246 <cmd name="activityinfo"><parm>
 247  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 248  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 249  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 250 </parm></cmd>
 251
 252
 253 %(opis)s
 254
 255 %(wskazowki)s
 256 """ % locals()
 257
 258     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 259     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 260
 261     @ifoption(sub_gen=True)
 262     def handle_pomoce(self, _):
 263         return "Pomoce: ", ""
 264
 265     def handle_czas(self, *_):
 266         return
 267
 268     def handle_forma(self, *_):
 269         return
 270
 271     def handle_lista(self, element, attrs={}):
 272         ltype = element.attrib.get('typ', 'punkt')
 273         if not element.findall("punkt"):
 274             if ltype == 'czytelnia':
 275                 return 'W przygotowaniu.'
 276             else:
 277                 return None
 278         if ltype == 'slowniczek':
 279             surl = element.attrib.get('src', None)
 280             if surl is None:
 281                 # print '** missing src on <slowniczek>, setting default'
 282                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 283             sxml = None
 284             if surl:
 285                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 286             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 287
 288         listcmd = {'num': 'enumerate',
 289                'punkt': 'itemize',
 290                'alfa': 'itemize',
 291                'slowniczek': 'itemize',
 292                'czytelnia': 'itemize'}[ltype]
 293
 294         return u'<env name="%s">' % listcmd, u'</env>'
 295
 296     def handle_punkt(self, element):
 297         return '<cmd name="item"/>', ''
 298
 299     def handle_cwiczenie(self, element):
 300         exercise_handlers = {
 301             'wybor': Wybor,
 302             'uporzadkuj': Uporzadkuj,
 303             'luki': Luki,
 304             'zastap': Zastap,
 305             'przyporzadkuj': Przyporzadkuj,
 306             'prawdafalsz': PrawdaFalsz
 307         }
 308
 309         typ = element.attrib['typ']
 310         self.exercise_counter += 1
 311         if not typ in exercise_handlers:
 312             return '(no handler)'
 313         self.options = {'exercise_counter': self.exercise_counter}
 314         handler = exercise_handlers[typ](self.options)
 315         return handler.generate(element)
 316
 317     # XXX this is copied from pyhtml.py, except for return and
 318     # should be refactored for no code duplication
 319     def handle_definiendum(self, element):
 320         nxt = element.getnext()
 321         definiens_s = ''
 322
 323         # let's pull definiens from another document
 324         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 325             sxml = self.options['slowniczek_xml']
 326             assert element.text != ''
 327             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 328             if defloc:
 329                 definiens = defloc[0].getnext()
 330                 if definiens.tag == 'definiens':
 331                     subgen = EduModule(self.options)
 332                     definiens_s = subgen.generate(definiens)
 333
 334         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 335
 336     def handle_definiens(self, element):
 337         return u"", u""
 338
 339     def handle_podpis(self, element):
 340         return u"""<env name="figure">""", u"</env>"
 341
 342     def handle_tabela(self, element):
 343         max_col = 0
 344         for w in element.xpath("wiersz"):
 345             ks = w.xpath("kol")
 346             if max_col < len(ks):
 347                 max_col = len(ks)
 348         self.options = {'columnts': max_col}
 349         # styling:
 350                 #        has_frames = int(element.attrib.get("ramki", "0"))
 351                 #        if has_frames: frames_c = "framed"
 352                 #        else: frames_c = ""
 353                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 354         return u'''
 355 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 356     ''' % ('l' * max_col), \
 357     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 358
 359     @escape(1)
 360     def handle_wiersz(self, element):
 361         return u"", u'<ctrl ch="\\"/>'
 362
 363     @escape(1)
 364     def handle_kol(self, element):
 365         if element.getnext() is not None:
 366             return u"", u'<spec cat="align" />'
 367         return u"", u""
 368
 369     def handle_link(self, element):
 370         if element.attrib.get('url'):
 371             url = element.attrib.get('url')
 372             if url == element.text:
 373                 return cmd('url')(self, element)
 374             else:
 375                 return cmd('href', parms=[element.attrib['url']])(self, element)
 376         else:
 377             return cmd('emph')(self, element)
 378
 379     def handle_obraz(self, element):
 380         frmt = self.options['format']
 381         name = element.attrib.get('nazwa', '').strip()
 382         image = frmt.get_image(name.strip())
 383         img_path = "obraz/%s" % name.replace("_", "")
 384         frmt.attachments[img_path] = image
 385         return cmd("obraz", parms=[img_path])(self)
 386
 387     def handle_video(self, element):
 388         url = element.attrib.get('url')
 389         if not url:
 390             print '!! <video> missing url'
 391             return
 392         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 393         if not m:
 394             print '!! unknown <video> url scheme:', url
 395             return
 396         name = m.group(1)
 397         thumb = IOFile.from_string(urlopen
 398             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 399         img_path = "video/%s.jpg" % name.replace("_", "")
 400         self.options['format'].attachments[img_path] = thumb
 401         canon_url = "https://www.youtube.com/watch?v=%s" % name
 402         return cmd("video", parms=[img_path, canon_url])(self)
 403
 404
 405 class Exercise(EduModule):
 406     def __init__(self, *args, **kw):
 407         self.question_counter = 0
 408         super(Exercise, self).__init__(*args, **kw)
 409
 410     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 411
 412     def handle_cwiczenie(self, element):
 413         self.options = {
 414             'exercise': element.attrib['typ'],
 415             'sub_gen': True,
 416         }
 417         self.question_counter = 0
 418         self.piece_counter = 0
 419
 420         header = etree.Element("parm")
 421         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 422         header_cmd.append(header)
 423         header.text = u"Zadanie %d." % self.options['exercise_counter']
 424
 425         pre = etree.tostring(header_cmd, encoding=unicode)
 426         post = u""
 427         # Add a single <pytanie> tag if it's not there
 428         if not element.xpath(".//pytanie"):
 429             qpre, qpost = self.handle_pytanie(element)
 430             pre = pre + qpre
 431             post = qpost + post
 432         return pre, post
 433
 434     def handle_pytanie(self, element):
 435         """This will handle <cwiczenie> element, when there is no <pytanie>
 436         """
 437         self.question_counter += 1
 438         self.piece_counter = 0
 439         pre = post = u""
 440         if self.options['teacher'] and element.attrib.get('rozw'):
 441             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 442         return pre, post
 443
 444     def handle_punkt(self, element):
 445         pre, post = super(Exercise, self).handle_punkt(element)
 446         if self.options['teacher'] and element.attrib.get('rozw'):
 447             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 448         return pre, post
 449
 450     def solution_header(self):
 451         par = etree.Element("cmd", name="par")
 452         parm = etree.Element("parm")
 453         parm.text = u"Rozwiązanie:"
 454         par.append(parm)
 455         return etree.tostring(par)
 456
 457     def explicit_solution(self):
 458         if self.options['solution']:
 459             par = etree.Element("cmd", name="par")
 460             parm = etree.Element("parm")
 461             parm.text = self.options['solution']
 462             par.append(parm)
 463             return self.solution_header() + etree.tostring(par)
 464
 465
 466
 467 class Wybor(Exercise):
 468     def handle_cwiczenie(self, element):
 469         pre, post = super(Wybor, self).handle_cwiczenie(element)
 470         is_single_choice = True
 471         pytania = element.xpath(".//pytanie")
 472         if not pytania:
 473             pytania = [element]
 474         for p in pytania:
 475             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 476             if len(solutions) != 1:
 477                 is_single_choice = False
 478                 break
 479             choices = p.xpath(".//*[@nazwa]")
 480             uniq = set()
 481             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 482             if len(choices) != len(uniq):
 483                 is_single_choice = False
 484                 break
 485
 486         self.options = {'single': is_single_choice}
 487         return pre, post
 488
 489     def handle_punkt(self, element):
 490         if self.options['exercise'] and element.attrib.get('nazwa', None):
 491             cmd = 'radio' if self.options['single'] else 'checkbox'
 492             return u'<cmd name="%s"/>' % cmd, ''
 493         else:
 494             return super(Wybor, self).handle_punkt(element)
 495
 496
 497 class Uporzadkuj(Exercise):
 498     def handle_pytanie(self, element):
 499         order_items = element.xpath(".//punkt/@rozw")
 500         return super(Uporzadkuj, self).handle_pytanie(element)
 501
 502
 503 class Przyporzadkuj(Exercise):
 504     def handle_lista(self, lista):
 505         header = etree.Element("parm")
 506         header_cmd = etree.Element("cmd", name="par")
 507         header_cmd.append(header)
 508         if 'nazwa' in lista.attrib:
 509             header.text = u"Kategorie:"
 510         elif 'cel' in lista.attrib:
 511             header.text = u"Elementy do przyporządkowania:"
 512         else:
 513             header.text = u"Lista:"
 514         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 515         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 516         return pre, post
 517
 518
 519 class Luki(Exercise):
 520     def find_pieces(self, question):
 521         return question.xpath(".//luka")
 522
 523     def solution(self, piece):
 524         piece = deepcopy(piece)
 525         piece.tail = None
 526         sub = EduModule()
 527         return sub.generate(piece)
 528
 529     def handle_pytanie(self, element):
 530         qpre, qpost = super(Luki, self).handle_pytanie(element)
 531
 532         luki = self.find_pieces(element)
 533         random.shuffle(luki)
 534         self.words = u"<env name='itemize'>%s</env>" % (
 535             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 536         )
 537         return qpre, qpost
 538
 539     def handle_opis(self, element):
 540         return '', self.words
 541
 542     def handle_luka(self, element):
 543         luka = "_" * 10
 544         if self.options['teacher']:
 545             piece = deepcopy(element)
 546             piece.tail = None
 547             sub = EduModule()
 548             text = sub.generate(piece)
 549             luka += u" [rozwiązanie: %s]" % text
 550         return luka
 551
 552
 553 class Zastap(Luki):
 554     def find_pieces(self, question):
 555         return question.xpath(".//zastap")
 556
 557     def solution(self, piece):
 558         return piece.attrib.get('rozw', '')
 559
 560     def list_header(self):
 561         return u"Elementy do wstawienia"
 562
 563     def handle_zastap(self, element):
 564         piece = deepcopy(element)
 565         piece.tail = None
 566         sub = EduModule()
 567         text = sub.generate(piece)
 568         if self.options['teacher'] and element.attrib.get('rozw'):
 569             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 570         return text
 571
 572
 573 class PrawdaFalsz(Exercise):
 574     def handle_punkt(self, element):
 575         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 576         if 'rozw' in element.attrib:
 577             post += u" [Prawda/Fałsz]"
 578         return pre, post
 579
 580
 581
 582 def fix_lists(tree):
 583     lists = tree.xpath(".//lista")
 584     for l in lists:
 585         if l.text:
 586             p = l.getprevious()
 587             if p is not None:
 588                 if p.tail is None: p.tail = ''
 589                 p.tail += l.text
 590             else:
 591                 p = l.getparent()
 592                 if p.text is None: p.text = ''
 593                 p.text += l.text
 594             l.text = ''
 595     return tree
 596
 597
 598 class EduModulePDFFormat(PDFFormat):
 599     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 600
 601     def get_texml(self):
 602         substitute_hyphens(self.wldoc.edoc)
 603         fix_hanging(self.wldoc.edoc)
 604
 605         self.attachments = {}
 606         edumod = EduModule({
 607             "wldoc": self.wldoc,
 608             "format": self,
 609             "teacher": self.customization.get('teacher'),
 610         })
 611         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 612
 613         open("/tmp/texml.xml", "w").write(texml)
 614         return texml
 615
 616     def get_tex_dir(self):
 617         temp = super(EduModulePDFFormat, self).get_tex_dir()
 618         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 619         for name, iofile in self.attachments.items():
 620             iofile.save_as(os.path.join(temp, name))
 621         return temp
 622
 623     def get_image(self, name):
 624         return self.wldoc.source.attachments[name]
 625