librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, ifoption, tag_open_close
  22 from librarian import DCNS, get_resource, IOFile
  23 from librarian import functions
  24 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  25
  26
  27 def escape(really):
  28     def deco(f):
  29         def _wrap(*args, **kw):
  30             value = f(*args, **kw)
  31
  32             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  33             postfix = u'</TeXML>'
  34             if isinstance(value, list):
  35                 import pdb
  36                 pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         """skip metadata in generation"""
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + (rights_lic and u'<opt>%s</opt>' % rights_lic or '') + \
  97             u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) + \
  98             u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors if author)
 110
 111     @escape(1)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
 115     def handle_utwor(self, element):
 116         lines = [
 117             u'''
 118                 <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 119                 <TeXML escape="0">
 120                 \\documentclass[%s]{wl}
 121                 \\usepackage{style}''' % self.options['customization_str'],
 122             self.options['has_cover'] and '\usepackage{makecover}',
 123             (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 124             (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 125             (self.options['morefloats'] == 'none' and
 126                 u'''\\IfFileExists{morefloats.sty}{
 127                 \\usepackage{morefloats}
 128                 }{}'''),
 129             u'''\\def\\authors{%s}''' % self.get_authors(element),
 130             u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 131             u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 132             u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 133
 134             u'''\\author{\\authors}''',
 135             u'''\\title{%s}''' % self.get_title(element),
 136             u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 137             u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 138             u'</TeXML>'
 139         ]
 140
 141         return u"".join(filter(None, lines)), u'</TeXML>'
 142
 143     @escape(1)
 144     def handle_powiesc(self, element):
 145         return u"""
 146     <env name="document">
 147     <cmd name="maketitle"/>
 148     """, """<cmd name="editorialsection" /></env>"""
 149
 150     @escape(1)
 151     def handle_texcommand(self, element):
 152         cmd = functions.texcommand(element.tag)
 153         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 154
 155     handle_akap = \
 156         handle_akap_cd = \
 157         handle_akap_dialog = \
 158         handle_autor_utworu = \
 159         handle_dedykacja = \
 160         handle_didaskalia = \
 161         handle_didask_tekst = \
 162         handle_dlugi_cytat = \
 163         handle_dzielo_nadrzedne = \
 164         handle_lista_osoba = \
 165         handle_mat = \
 166         handle_miejsce_czas = \
 167         handle_motto = \
 168         handle_motto_podpis = \
 169         handle_naglowek_akt = \
 170         handle_naglowek_czesc = \
 171         handle_naglowek_listy = \
 172         handle_naglowek_osoba = \
 173         handle_naglowek_scena = \
 174         handle_nazwa_utworu = \
 175         handle_nota = \
 176         handle_osoba = \
 177         handle_pa = \
 178         handle_pe = \
 179         handle_podtytul = \
 180         handle_poezja_cyt = \
 181         handle_pr = \
 182         handle_pt = \
 183         handle_sekcja_asterysk = \
 184         handle_sekcja_swiatlo = \
 185         handle_separator_linia = \
 186         handle_slowo_obce = \
 187         handle_srodtytul = \
 188         handle_tytul_dziela = \
 189         handle_wyroznienie = \
 190         handle_dywiz = \
 191         handle_texcommand
 192
 193     def handle_naglowek_rozdzial(self, element):
 194         if not self.options['teacher']:
 195             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek', u'Dla ucznia')):
 196                 self.state['mute'] = False
 197             else:
 198                 self.state['mute'] = True
 199                 return None
 200         return self.handle_texcommand(element)
 201     handle_naglowek_rozdzial.unmuter = True
 202
 203     def handle_naglowek_podrozdzial(self, element):
 204         self.activity_counter = 0
 205         if not self.options['teacher']:
 206             if element.text.startswith(u'Dla ucznia'):
 207                 self.state['mute'] = False
 208                 return None
 209             elif element.text.startswith(u'Dla nauczyciela'):
 210                 self.state['mute'] = True
 211                 return None
 212             elif self.state['mute']:
 213                 return None
 214         return self.handle_texcommand(element)
 215     handle_naglowek_podrozdzial.unmuter = True
 216
 217     def handle_uwaga(self, _e):
 218         return None
 219
 220     def handle_extra(self, _e):
 221         return None
 222
 223     def handle_nbsp(self, _e):
 224         return '<spec cat="tilde" />'
 225
 226     _handle_strofa = cmd("strofa")
 227
 228     def handle_strofa(self, element):
 229         self.options = {'strofa': True}
 230         return self._handle_strofa(element)
 231
 232     def handle_aktywnosc(self, element):
 233         self.activity_counter += 1
 234         self.options = {
 235             'activity': True,
 236             'activity_counter': self.activity_counter,
 237             'sub_gen': True,
 238         }
 239         submill = EduModule(self.options, self.state)
 240
 241         if element.xpath('opis'):
 242             opis = submill.generate(element.xpath('opis')[0])
 243         else:
 244             opis = ''
 245
 246         n = element.xpath('wskazowki')
 247         if n:
 248             wskazowki = submill.generate(n[0])
 249         else:
 250             wskazowki = ''
 251         n = element.xpath('pomoce')
 252
 253         if n:
 254             pomoce = submill.generate(n[0])
 255         else:
 256             pomoce = ''
 257
 258         forma = ''.join(element.xpath('forma/text()'))
 259
 260         czas = ''.join(element.xpath('czas/text()'))
 261
 262         counter = self.activity_counter
 263
 264         if element.getnext().tag == 'aktywnosc' or (len(self.activity_last) and self.activity_last.getnext() == element):
 265             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 266         else:
 267             counter_tex = ''
 268
 269         self.activity_last = element
 270
 271         return u"""
 272 <cmd name="noindent" />
 273 %(counter_tex)s
 274 <cmd name="activityinfo"><parm>
 275  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 276  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 277  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 278 </parm></cmd>
 279
 280
 281 %(opis)s
 282
 283 %(wskazowki)s
 284 """ % locals()
 285
 286     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 287     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 288
 289     @ifoption(sub_gen=True)
 290     def handle_pomoce(self, _):
 291         return "Pomoce: ", ""
 292
 293     def handle_czas(self, *_):
 294         return
 295
 296     def handle_forma(self, *_):
 297         return
 298
 299     def handle_lista(self, element, attrs=None):
 300         ltype = element.attrib.get('typ', 'punkt')
 301         if not element.findall("punkt"):
 302             if ltype == 'czytelnia':
 303                 return 'W przygotowaniu.'
 304             else:
 305                 return None
 306         if ltype == 'slowniczek':
 307             surl = element.attrib.get('src', None)
 308             if surl is None:
 309                 # print '** missing src on <slowniczek>, setting default'
 310                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 311             sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 312             self.options = {'slowniczek': True, 'slowniczek_xml': sxml}
 313
 314         listcmd = {
 315             'num': 'enumerate',
 316             'punkt': 'itemize',
 317             'alfa': 'itemize',
 318             'slowniczek': 'itemize',
 319             'czytelnia': 'itemize'
 320         }[ltype]
 321
 322         return u'<env name="%s">' % listcmd, u'</env>'
 323
 324     def handle_punkt(self, element):
 325         return '<cmd name="item"/>', ''
 326
 327     def handle_cwiczenie(self, element):
 328         exercise_handlers = {
 329             'wybor': Wybor,
 330             'uporzadkuj': Uporzadkuj,
 331             'luki': Luki,
 332             'zastap': Zastap,
 333             'przyporzadkuj': Przyporzadkuj,
 334             'prawdafalsz': PrawdaFalsz
 335         }
 336
 337         typ = element.attrib['typ']
 338         self.exercise_counter += 1
 339         if typ not in exercise_handlers:
 340             return '(no handler)'
 341         self.options = {'exercise_counter': self.exercise_counter}
 342         handler = exercise_handlers[typ](self.options, self.state)
 343         return handler.generate(element)
 344
 345     # XXX this is copied from pyhtml.py, except for return and
 346     # should be refactored for no code duplication
 347     def handle_definiendum(self, element):
 348         nxt = element.getnext()
 349         definiens_s = ''
 350
 351         # let's pull definiens from another document
 352         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 353             sxml = self.options['slowniczek_xml']
 354             assert element.text != ''
 355             if "'" in (element.text or ''):
 356                 defloc = sxml.xpath("//definiendum[text()=\"%s\"]" % (element.text or '').strip())
 357             else:
 358                 defloc = sxml.xpath("//definiendum[text()='%s']" % (element.text or '').strip())
 359             if defloc:
 360                 definiens = defloc[0].getnext()
 361                 if definiens.tag == 'definiens':
 362                     subgen = EduModule(self.options, self.state)
 363                     definiens_s = subgen.generate(definiens)
 364
 365         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 366
 367     def handle_definiens(self, element):
 368         return u"", u""
 369
 370     def handle_podpis(self, element):
 371         return u"""<env name="figure">""", u"</env>"
 372
 373     def handle_tabela(self, element):
 374         max_col = 0
 375         for w in element.xpath("wiersz"):
 376             ks = w.xpath("kol")
 377             if max_col < len(ks):
 378                 max_col = len(ks)
 379         self.options = {'columnts': max_col}
 380         # styling:
 381         #     has_frames = int(element.attrib.get("ramki", "0"))
 382         #     if has_frames: frames_c = "framed"
 383         #     else: frames_c = ""
 384         #     return u"""<table class="%s">""" % frames_c, u"</table>"
 385         return u'''
 386 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 387     ''' % ('l' * max_col), u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 388
 389     @escape(1)
 390     def handle_wiersz(self, element):
 391         return u"", u'<ctrl ch="\\"/>'
 392
 393     @escape(1)
 394     def handle_kol(self, element):
 395         if element.getnext() is not None:
 396             return u"", u'<spec cat="align" />'
 397         return u"", u""
 398
 399     def handle_link(self, element):
 400         if element.attrib.get('url'):
 401             url = element.attrib.get('url')
 402             if url == element.text:
 403                 return cmd('url')(self, element)
 404             else:
 405                 return cmd('href', parms=[element.attrib['url']])(self, element)
 406         else:
 407             return cmd('emph')(self, element)
 408
 409     def handle_obraz(self, element):
 410         frmt = self.options['format']
 411         name = element.attrib.get('nazwa', '').strip()
 412         image = frmt.get_image(name.strip())
 413         name = image.get_filename().rsplit('/', 1)[-1]
 414         img_path = "obraz/%s" % name.replace("_", "")
 415         frmt.attachments[img_path] = image
 416         return cmd("obraz", parms=[img_path])(self)
 417
 418     def handle_video(self, element):
 419         url = element.attrib.get('url')
 420         if not url:
 421             print '!! <video> missing url'
 422             return
 423         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 424         if not m:
 425             print '!! unknown <video> url scheme:', url
 426             return
 427         name = m.group(1)
 428         thumb = IOFile.from_string(urlopen("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 429         img_path = "video/%s.jpg" % name.replace("_", "")
 430         self.options['format'].attachments[img_path] = thumb
 431         canon_url = "https://www.youtube.com/watch?v=%s" % name
 432         return cmd("video", parms=[img_path, canon_url])(self)
 433
 434
 435 class Exercise(EduModule):
 436     def __init__(self, *args, **kw):
 437         self.question_counter = 0
 438         super(Exercise, self).__init__(*args, **kw)
 439         self.piece_counter = None
 440
 441     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 442
 443     def handle_cwiczenie(self, element):
 444         self.options = {
 445             'exercise': element.attrib['typ'],
 446             'sub_gen': True,
 447         }
 448         self.question_counter = 0
 449         self.piece_counter = 0
 450
 451         header = etree.Element("parm")
 452         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 453         header_cmd.append(header)
 454         header.text = u"Zadanie %d." % self.options['exercise_counter']
 455
 456         pre = etree.tostring(header_cmd, encoding=unicode)
 457         post = u""
 458         # Add a single <pytanie> tag if it's not there
 459         if not element.xpath(".//pytanie"):
 460             qpre, qpost = self.handle_pytanie(element)
 461             pre += qpre
 462             post = qpost + post
 463         return pre, post
 464
 465     def handle_pytanie(self, element):
 466         """This will handle <cwiczenie> element, when there is no <pytanie>
 467         """
 468         self.question_counter += 1
 469         self.piece_counter = 0
 470         pre = post = u""
 471         if self.options['teacher'] and element.attrib.get('rozw'):
 472             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 473         return pre, post
 474
 475     def handle_punkt(self, element):
 476         pre, post = super(Exercise, self).handle_punkt(element)
 477         if self.options['teacher'] and element.attrib.get('rozw'):
 478             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 479         return pre, post
 480
 481     def solution_header(self):
 482         par = etree.Element("cmd", name="par")
 483         parm = etree.Element("parm")
 484         parm.text = u"Rozwiązanie:"
 485         par.append(parm)
 486         return etree.tostring(par)
 487
 488     def explicit_solution(self):
 489         if self.options['solution']:
 490             par = etree.Element("cmd", name="par")
 491             parm = etree.Element("parm")
 492             parm.text = self.options['solution']
 493             par.append(parm)
 494             return self.solution_header() + etree.tostring(par)
 495
 496
 497 class Wybor(Exercise):
 498     def handle_cwiczenie(self, element):
 499         pre, post = super(Wybor, self).handle_cwiczenie(element)
 500         is_single_choice = True
 501         pytania = element.xpath(".//pytanie")
 502         if not pytania:
 503             pytania = [element]
 504         for p in pytania:
 505             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 506             if len(solutions) != 1:
 507                 is_single_choice = False
 508                 break
 509             choices = p.xpath(".//*[@nazwa]")
 510             uniq = set()
 511             for n in choices:
 512                 uniq.add(n.attrib.get('nazwa', ''))
 513             if len(choices) != len(uniq):
 514                 is_single_choice = False
 515                 break
 516
 517         self.options = {'single': is_single_choice}
 518         return pre, post
 519
 520     def handle_punkt(self, element):
 521         if self.options['exercise'] and element.attrib.get('nazwa', None):
 522             cmd = 'radio' if self.options['single'] else 'checkbox'
 523             return u'<cmd name="%s"/>' % cmd, ''
 524         else:
 525             return super(Wybor, self).handle_punkt(element)
 526
 527
 528 class Uporzadkuj(Exercise):
 529     def handle_pytanie(self, element):
 530         order_items = element.xpath(".//punkt/@rozw")
 531         return super(Uporzadkuj, self).handle_pytanie(element)
 532
 533
 534 class Przyporzadkuj(Exercise):
 535     def handle_lista(self, lista):
 536         header = etree.Element("parm")
 537         header_cmd = etree.Element("cmd", name="par")
 538         header_cmd.append(header)
 539         if 'nazwa' in lista.attrib:
 540             header.text = u"Kategorie:"
 541         elif 'cel' in lista.attrib:
 542             header.text = u"Elementy do przyporządkowania:"
 543         else:
 544             header.text = u"Lista:"
 545         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 546         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 547         return pre, post
 548
 549
 550 class Luki(Exercise):
 551     def find_pieces(self, question):
 552         return question.xpath(".//luka")
 553
 554     def solution(self, piece):
 555         piece = deepcopy(piece)
 556         piece.tail = None
 557         sub = EduModule()
 558         return sub.generate(piece)
 559
 560     def handle_pytanie(self, element):
 561         qpre, qpost = super(Luki, self).handle_pytanie(element)
 562
 563         luki = self.find_pieces(element)
 564         random.shuffle(luki)
 565         self.words = u"<env name='itemize'>%s</env>" % (
 566             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 567         )
 568         return qpre, qpost
 569
 570     def handle_opis(self, element):
 571         return '', self.words
 572
 573     def handle_luka(self, element):
 574         luka = "_" * 10
 575         if self.options['teacher']:
 576             piece = deepcopy(element)
 577             piece.tail = None
 578             sub = EduModule()
 579             text = sub.generate(piece)
 580             luka += u" [rozwiązanie: %s]" % text
 581         return luka
 582
 583
 584 class Zastap(Luki):
 585     def find_pieces(self, question):
 586         return question.xpath(".//zastap")
 587
 588     def solution(self, piece):
 589         return piece.attrib.get('rozw', '')
 590
 591     def list_header(self):
 592         return u"Elementy do wstawienia"
 593
 594     def handle_zastap(self, element):
 595         piece = deepcopy(element)
 596         piece.tail = None
 597         sub = EduModule()
 598         text = sub.generate(piece)
 599         if self.options['teacher'] and element.attrib.get('rozw'):
 600             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 601         return text
 602
 603
 604 class PrawdaFalsz(Exercise):
 605     def handle_punkt(self, element):
 606         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 607         if 'rozw' in element.attrib:
 608             post += u" [Prawda/Fałsz]"
 609         return pre, post
 610
 611
 612 def fix_lists(tree):
 613     lists = tree.xpath(".//lista")
 614     for l in lists:
 615         if l.text:
 616             p = l.getprevious()
 617             if p is not None:
 618                 if p.tail is None:
 619                     p.tail = ''
 620                 p.tail += l.text
 621             else:
 622                 p = l.getparent()
 623                 if p.text is None:
 624                     p.text = ''
 625                 p.text += l.text
 626             l.text = ''
 627     return tree
 628
 629
 630 class EduModulePDFFormat(PDFFormat):
 631     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 632
 633     def get_texml(self):
 634         substitute_hyphens(self.wldoc.edoc)
 635         fix_hanging(self.wldoc.edoc)
 636
 637         self.attachments = {}
 638         edumod = EduModule({
 639             "wldoc": self.wldoc,
 640             "format": self,
 641             "teacher": self.customization.get('teacher'),
 642         })
 643         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 644
 645         open("/tmp/texml.xml", "w").write(texml)
 646         return texml
 647
 648     def get_tex_dir(self):
 649         temp = super(EduModulePDFFormat, self).get_tex_dir()
 650         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 651         for name, iofile in self.attachments.items():
 652             iofile.save_as(os.path.join(temp, name))
 653         return temp
 654
 655     def get_image(self, name):
 656         return self.wldoc.source.attachments[name]