librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, ifoption, tag_open_close
  22 from librarian import DCNS, get_resource, IOFile
  23 from librarian import functions
  24 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  25
  26
  27 def escape(really):
  28     def deco(f):
  29         def _wrap(*args, **kw):
  30             value = f(*args, **kw)
  31
  32             prefix = (u'<TeXML escape="%d">' % (1 if really else 0))
  33             postfix = u'</TeXML>'
  34             if isinstance(value, list):
  35                 import pdb
  36                 pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         """skip metadata in generation"""
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + (rights_lic and u'<opt>%s</opt>' % rights_lic or '') + \
  97             u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) + \
  98             u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors if author)
 110
 111     @escape(True)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
 115     @escape(True)
 116     def get_description(self, element):
 117         return self.get_dc(element, 'description', single=True)
 118
 119     def handle_utwor(self, element):
 120         lines = [
 121             u'''
 122                 <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 123                 <TeXML escape="0">
 124                 \\documentclass[%s]{wl}
 125                 \\usepackage{style}''' % self.options['customization_str'],
 126             self.options['has_cover'] and '\usepackage{makecover}',
 127             (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 128             (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 129             (self.options['morefloats'] == 'none' and
 130                 u'''\\IfFileExists{morefloats.sty}{
 131                 \\usepackage{morefloats}
 132                 }{}'''),
 133             u'''\\def\\authors{%s}''' % self.get_authors(element),
 134             u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 135             u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 136             u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 137             u'''\\def\\description{%s}''' % self.get_description(element),
 138
 139             u'''\\author{\\authors}''',
 140             u'''\\title{%s}''' % self.get_title(element),
 141             u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 142             u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 143             u'</TeXML>'
 144         ]
 145
 146         return u"".join(filter(None, lines)), u'</TeXML>'
 147
 148     @escape(True)
 149     def handle_powiesc(self, element):
 150         return u"""
 151     <env name="document">
 152     <cmd name="maketitle"/>
 153     """, """<cmd name="editorialsection" /></env>"""
 154
 155     @escape(True)
 156     def handle_texcommand(self, element):
 157         cmd = functions.texcommand(element.tag)
 158         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 159
 160     handle_akap = \
 161         handle_akap_cd = \
 162         handle_akap_dialog = \
 163         handle_autor_utworu = \
 164         handle_dedykacja = \
 165         handle_didaskalia = \
 166         handle_didask_tekst = \
 167         handle_dlugi_cytat = \
 168         handle_dzielo_nadrzedne = \
 169         handle_lista_osoba = \
 170         handle_mat = \
 171         handle_miejsce_czas = \
 172         handle_motto = \
 173         handle_motto_podpis = \
 174         handle_naglowek_akt = \
 175         handle_naglowek_czesc = \
 176         handle_naglowek_listy = \
 177         handle_naglowek_osoba = \
 178         handle_naglowek_scena = \
 179         handle_nazwa_utworu = \
 180         handle_nota = \
 181         handle_osoba = \
 182         handle_pa = \
 183         handle_pe = \
 184         handle_podtytul = \
 185         handle_poezja_cyt = \
 186         handle_pr = \
 187         handle_pt = \
 188         handle_sekcja_asterysk = \
 189         handle_sekcja_swiatlo = \
 190         handle_separator_linia = \
 191         handle_slowo_obce = \
 192         handle_srodtytul = \
 193         handle_tytul_dziela = \
 194         handle_wyroznienie = \
 195         handle_dywiz = \
 196         handle_texcommand
 197
 198     def handle_naglowek_rozdzial(self, element):
 199         if not self.options['teacher']:
 200             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek', u'Dla ucznia')):
 201                 self.state['mute'] = False
 202             else:
 203                 self.state['mute'] = True
 204                 return None
 205         return self.handle_texcommand(element)
 206     handle_naglowek_rozdzial.unmuter = True
 207
 208     def handle_naglowek_podrozdzial(self, element):
 209         self.activity_counter = 0
 210         if not self.options['teacher']:
 211             if element.text.startswith(u'Dla ucznia'):
 212                 self.state['mute'] = False
 213                 return None
 214             elif element.text.startswith(u'Dla nauczyciela'):
 215                 self.state['mute'] = True
 216                 return None
 217             elif self.state['mute']:
 218                 return None
 219         return self.handle_texcommand(element)
 220     handle_naglowek_podrozdzial.unmuter = True
 221
 222     def handle_uwaga(self, _e):
 223         return None
 224
 225     def handle_extra(self, _e):
 226         return None
 227
 228     def handle_nbsp(self, _e):
 229         return '<spec cat="tilde" />'
 230
 231     _handle_strofa = cmd("strofa")
 232
 233     def handle_strofa(self, element):
 234         self.options = {'strofa': True}
 235         return self._handle_strofa(element)
 236
 237     def handle_aktywnosc(self, element):
 238         self.activity_counter += 1
 239         self.options = {
 240             'activity': True,
 241             'activity_counter': self.activity_counter,
 242             'sub_gen': True,
 243         }
 244         submill = EduModule(self.options, self.state)
 245
 246         if element.xpath('opis'):
 247             opis = submill.generate(element.xpath('opis')[0])
 248         else:
 249             opis = ''
 250
 251         n = element.xpath('wskazowki')
 252         if n:
 253             wskazowki = submill.generate(n[0])
 254         else:
 255             wskazowki = ''
 256         n = element.xpath('pomoce')
 257
 258         if n:
 259             pomoce = submill.generate(n[0])
 260         else:
 261             pomoce = ''
 262
 263         forma = ''.join(element.xpath('forma/text()'))
 264
 265         czas = ''.join(element.xpath('czas/text()'))
 266
 267         counter = self.activity_counter
 268
 269         if element.getnext().tag == 'aktywnosc' or (len(self.activity_last) and self.activity_last.getnext() == element):
 270             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 271         else:
 272             counter_tex = ''
 273
 274         self.activity_last = element
 275
 276         return u"""
 277 <cmd name="noindent" />
 278 %(counter_tex)s
 279 <cmd name="activityinfo"><parm>
 280  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 281  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 282  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 283 </parm></cmd>
 284
 285
 286 %(opis)s
 287
 288 %(wskazowki)s
 289 """ % locals()
 290
 291     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 292     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 293
 294     @ifoption(sub_gen=True)
 295     def handle_pomoce(self, _):
 296         return "Pomoce: ", ""
 297
 298     def handle_czas(self, *_):
 299         return
 300
 301     def handle_forma(self, *_):
 302         return
 303
 304     def handle_lista(self, element, attrs=None):
 305         ltype = element.attrib.get('typ', 'punkt')
 306         if not element.findall("punkt"):
 307             if ltype == 'czytelnia':
 308                 return 'W przygotowaniu.'
 309             else:
 310                 return None
 311         if ltype == 'slowniczek':
 312             surl = element.attrib.get('src', None)
 313             if surl is None:
 314                 # print '** missing src on <slowniczek>, setting default'
 315                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 316             sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 317             self.options = {'slowniczek': True, 'slowniczek_xml': sxml}
 318
 319         listcmd = {
 320             'num': 'enumerate',
 321             'punkt': 'itemize',
 322             'alfa': 'itemize',
 323             'slowniczek': 'itemize',
 324             'czytelnia': 'itemize'
 325         }[ltype]
 326
 327         return u'<env name="%s">' % listcmd, u'</env>'
 328
 329     def handle_punkt(self, element):
 330         return '<cmd name="item"/>', ''
 331
 332     def handle_cwiczenie(self, element):
 333         exercise_handlers = {
 334             'wybor': Wybor,
 335             'uporzadkuj': Uporzadkuj,
 336             'luki': Luki,
 337             'zastap': Zastap,
 338             'przyporzadkuj': Przyporzadkuj,
 339             'prawdafalsz': PrawdaFalsz
 340         }
 341
 342         typ = element.attrib['typ']
 343         self.exercise_counter += 1
 344         if typ not in exercise_handlers:
 345             return '(no handler)'
 346         self.options = {'exercise_counter': self.exercise_counter}
 347         handler = exercise_handlers[typ](self.options, self.state)
 348         return handler.generate(element)
 349
 350     # XXX this is copied from pyhtml.py, except for return and
 351     # should be refactored for no code duplication
 352     def handle_definiendum(self, element):
 353         nxt = element.getnext()
 354         definiens_s = ''
 355
 356         # let's pull definiens from another document
 357         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 358             sxml = self.options['slowniczek_xml']
 359             assert element.text != ''
 360             if "'" in (element.text or ''):
 361                 defloc = sxml.xpath("//definiendum[text()=\"%s\"]" % (element.text or '').strip())
 362             else:
 363                 defloc = sxml.xpath("//definiendum[text()='%s']" % (element.text or '').strip())
 364             if defloc:
 365                 definiens = defloc[0].getnext()
 366                 if definiens.tag == 'definiens':
 367                     subgen = EduModule(self.options, self.state)
 368                     definiens_s = subgen.generate(definiens)
 369
 370         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 371
 372     def handle_definiens(self, element):
 373         return u"", u""
 374
 375     def handle_podpis(self, element):
 376         return u"""<env name="figure">""", u"</env>"
 377
 378     def handle_tabela(self, element):
 379         max_col = 0
 380         for w in element.xpath("wiersz"):
 381             ks = w.xpath("kol")
 382             if max_col < len(ks):
 383                 max_col = len(ks)
 384         self.options = {'columnts': max_col}
 385         # styling:
 386         #     has_frames = int(element.attrib.get("ramki", "0"))
 387         #     if has_frames: frames_c = "framed"
 388         #     else: frames_c = ""
 389         #     return u"""<table class="%s">""" % frames_c, u"</table>"
 390         return u'''
 391 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 392     ''' % ('l' * max_col), u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 393
 394     @escape(True)
 395     def handle_wiersz(self, element):
 396         return u"", u'<ctrl ch="\\"/>'
 397
 398     @escape(True)
 399     def handle_kol(self, element):
 400         if element.getnext() is not None:
 401             return u"", u'<spec cat="align" />'
 402         return u"", u""
 403
 404     def handle_link(self, element):
 405         if element.attrib.get('url'):
 406             url = element.attrib.get('url')
 407             if url == element.text:
 408                 return cmd('url')(self, element)
 409             else:
 410                 return cmd('href', parms=[element.attrib['url']])(self, element)
 411         else:
 412             return cmd('emph')(self, element)
 413
 414     def handle_obraz(self, element):
 415         frmt = self.options['format']
 416         name = element.attrib.get('nazwa', '').strip()
 417         image = frmt.get_image(name.strip())
 418         name = image.get_filename().rsplit('/', 1)[-1]
 419         img_path = "obraz/%s" % name.replace("_", "")
 420         frmt.attachments[img_path] = image
 421         return cmd("obraz", parms=[img_path])(self)
 422
 423     def handle_video(self, element):
 424         url = element.attrib.get('url')
 425         if not url:
 426             print '!! <video> missing url'
 427             return
 428         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 429         if not m:
 430             print '!! unknown <video> url scheme:', url
 431             return
 432         name = m.group(1)
 433         thumb = IOFile.from_string(urlopen("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 434         img_path = "video/%s.jpg" % name.replace("_", "")
 435         self.options['format'].attachments[img_path] = thumb
 436         canon_url = "https://www.youtube.com/watch?v=%s" % name
 437         return cmd("video", parms=[img_path, canon_url])(self)
 438
 439
 440 class Exercise(EduModule):
 441     def __init__(self, *args, **kw):
 442         self.question_counter = 0
 443         super(Exercise, self).__init__(*args, **kw)
 444         self.piece_counter = None
 445
 446     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 447
 448     def handle_cwiczenie(self, element):
 449         self.options = {
 450             'exercise': element.attrib['typ'],
 451             'sub_gen': True,
 452         }
 453         self.question_counter = 0
 454         self.piece_counter = 0
 455
 456         header = etree.Element("parm")
 457         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 458         header_cmd.append(header)
 459         header.text = u"Zadanie %d." % self.options['exercise_counter']
 460
 461         pre = etree.tostring(header_cmd, encoding=unicode)
 462         post = u""
 463         # Add a single <pytanie> tag if it's not there
 464         if not element.xpath(".//pytanie"):
 465             qpre, qpost = self.handle_pytanie(element)
 466             pre += qpre
 467             post = qpost + post
 468         return pre, post
 469
 470     def handle_pytanie(self, element):
 471         """This will handle <cwiczenie> element, when there is no <pytanie>
 472         """
 473         self.question_counter += 1
 474         self.piece_counter = 0
 475         pre = post = u""
 476         if self.options['teacher'] and element.attrib.get('rozw'):
 477             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 478         return pre, post
 479
 480     def handle_punkt(self, element):
 481         pre, post = super(Exercise, self).handle_punkt(element)
 482         if self.options['teacher'] and element.attrib.get('rozw'):
 483             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 484         return pre, post
 485
 486     def solution_header(self):
 487         par = etree.Element("cmd", name="par")
 488         parm = etree.Element("parm")
 489         parm.text = u"Rozwiązanie:"
 490         par.append(parm)
 491         return etree.tostring(par)
 492
 493     def explicit_solution(self):
 494         if self.options['solution']:
 495             par = etree.Element("cmd", name="par")
 496             parm = etree.Element("parm")
 497             parm.text = self.options['solution']
 498             par.append(parm)
 499             return self.solution_header() + etree.tostring(par)
 500
 501
 502 class Wybor(Exercise):
 503     def handle_cwiczenie(self, element):
 504         pre, post = super(Wybor, self).handle_cwiczenie(element)
 505         is_single_choice = True
 506         pytania = element.xpath(".//pytanie")
 507         if not pytania:
 508             pytania = [element]
 509         for p in pytania:
 510             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 511             if len(solutions) != 1:
 512                 is_single_choice = False
 513                 break
 514             choices = p.xpath(".//*[@nazwa]")
 515             uniq = set()
 516             for n in choices:
 517                 uniq.add(n.attrib.get('nazwa', ''))
 518             if len(choices) != len(uniq):
 519                 is_single_choice = False
 520                 break
 521
 522         self.options = {'single': is_single_choice}
 523         return pre, post
 524
 525     def handle_punkt(self, element):
 526         if self.options['exercise'] and element.attrib.get('nazwa', None):
 527             cmd = 'radio' if self.options['single'] else 'checkbox'
 528             return u'<cmd name="%s"/>' % cmd, ''
 529         else:
 530             return super(Wybor, self).handle_punkt(element)
 531
 532
 533 class Uporzadkuj(Exercise):
 534     def handle_pytanie(self, element):
 535         order_items = element.xpath(".//punkt/@rozw")
 536         return super(Uporzadkuj, self).handle_pytanie(element)
 537
 538
 539 class Przyporzadkuj(Exercise):
 540     def handle_lista(self, lista):
 541         header = etree.Element("parm")
 542         header_cmd = etree.Element("cmd", name="par")
 543         header_cmd.append(header)
 544         if 'nazwa' in lista.attrib:
 545             header.text = u"Kategorie:"
 546         elif 'cel' in lista.attrib:
 547             header.text = u"Elementy do przyporządkowania:"
 548         else:
 549             header.text = u"Lista:"
 550         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 551         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 552         return pre, post
 553
 554
 555 class Luki(Exercise):
 556     def find_pieces(self, question):
 557         return question.xpath(".//luka")
 558
 559     def solution(self, piece):
 560         piece = deepcopy(piece)
 561         piece.tail = None
 562         sub = EduModule()
 563         return sub.generate(piece)
 564
 565     def handle_pytanie(self, element):
 566         qpre, qpost = super(Luki, self).handle_pytanie(element)
 567
 568         luki = self.find_pieces(element)
 569         random.shuffle(luki)
 570         self.words = u"<env name='itemize'>%s</env>" % (
 571             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 572         )
 573         return qpre, qpost
 574
 575     def handle_opis(self, element):
 576         return '', self.words
 577
 578     def handle_luka(self, element):
 579         luka = "_" * 10
 580         if self.options['teacher']:
 581             piece = deepcopy(element)
 582             piece.tail = None
 583             sub = EduModule()
 584             text = sub.generate(piece)
 585             luka += u" [rozwiązanie: %s]" % text
 586         return luka
 587
 588
 589 class Zastap(Luki):
 590     def find_pieces(self, question):
 591         return question.xpath(".//zastap")
 592
 593     def solution(self, piece):
 594         return piece.attrib.get('rozw', '')
 595
 596     def list_header(self):
 597         return u"Elementy do wstawienia"
 598
 599     def handle_zastap(self, element):
 600         piece = deepcopy(element)
 601         piece.tail = None
 602         sub = EduModule()
 603         text = sub.generate(piece)
 604         if self.options['teacher'] and element.attrib.get('rozw'):
 605             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 606         return text
 607
 608
 609 class PrawdaFalsz(Exercise):
 610     def handle_punkt(self, element):
 611         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 612         if 'rozw' in element.attrib:
 613             post += u" [Prawda/Fałsz]"
 614         return pre, post
 615
 616
 617 def fix_lists(tree):
 618     lists = tree.xpath(".//lista")
 619     for l in lists:
 620         if l.text:
 621             p = l.getprevious()
 622             if p is not None:
 623                 if p.tail is None:
 624                     p.tail = ''
 625                 p.tail += l.text
 626             else:
 627                 p = l.getparent()
 628                 if p.text is None:
 629                     p.text = ''
 630                 p.text += l.text
 631             l.text = ''
 632     return tree
 633
 634
 635 class EduModulePDFFormat(PDFFormat):
 636     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 637
 638     def get_texml(self):
 639         substitute_hyphens(self.wldoc.edoc)
 640         fix_hanging(self.wldoc.edoc)
 641
 642         self.attachments = {}
 643         edumod = EduModule({
 644             "wldoc": self.wldoc,
 645             "format": self,
 646             "teacher": self.customization.get('teacher'),
 647         })
 648         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 649
 650         open("/tmp/texml.xml", "w").write(texml)
 651         return texml
 652
 653     def get_tex_dir(self):
 654         temp = super(EduModulePDFFormat, self).get_tex_dir()
 655         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 656         for name, iofile in self.attachments.items():
 657             iofile.save_as(os.path.join(temp, name))
 658         return temp
 659
 660     def get_image(self, name):
 661         return self.wldoc.source.attachments[name]