librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.activity_last = None
  73         self.exercise_counter = 0
  74
  75         def swap_endlines(txt):
  76             if self.options['strofa']:
  77                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  78             return txt
  79         self.register_text_filter(swap_endlines)
  80         self.register_text_filter(functions.substitute_entities)
  81         self.register_text_filter(mark_alien_characters)
  82
  83     def get_dc(self, element, dc_field, single=False):
  84         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  85         if single:
  86             return values[0]
  87         return values
  88
  89     def handle_rdf__RDF(self, _):
  90         "skip metadata in generation"
  91         return
  92
  93     @escape(True)
  94     def get_rightsinfo(self, element):
  95         rights_lic = self.get_dc(element, 'rights.license', True)
  96         return u'<cmd name="rightsinfostr">' + \
  97           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  98           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  99           u'</cmd>'
 100
 101     @escape(True)
 102     def get_authors(self, element, which=None):
 103         dc = self.options['wldoc'].book_info
 104         if which is None:
 105             authors = dc.authors_textbook + \
 106                 dc.authors_scenario + \
 107                 dc.authors_expert
 108         else:
 109             authors = getattr(dc, "authors_%s" % which)
 110         return u', '.join(author.readable() for author in authors)
 111
 112     @escape(1)
 113     def get_title(self, element):
 114         return self.get_dc(element, 'title', True)
 115
 116     def handle_utwor(self, element):
 117         lines = [
 118             u'''
 119     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 120         <TeXML escape="0">
 121         \\documentclass[%s]{wl}
 122         \\usepackage{style}''' % self.options['customization_str'],
 123     self.options['has_cover'] and '\usepackage{makecover}',
 124     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 125     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 126     (self.options['morefloats'] == 'none' and
 127      u'''\\IfFileExists{morefloats.sty}{
 128             \\usepackage{morefloats}
 129         }{}'''),
 130     u'''\\def\\authors{%s}''' % self.get_authors(element),
 131     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 132     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 133     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 134
 135     u'''\\author{\\authors}''',
 136     u'''\\title{%s}''' % self.get_title(element),
 137     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 138     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 139     u'</TeXML>']
 140
 141         return u"".join(filter(None, lines)), u'</TeXML>'
 142
 143
 144     @escape(1)
 145     def handle_powiesc(self, element):
 146         return u"""
 147     <env name="document">
 148     <cmd name="maketitle"/>
 149     """, """<cmd name="editorialsection" /></env>"""
 150
 151     @escape(1)
 152     def handle_texcommand(self, element):
 153         cmd = functions.texcommand(element.tag)
 154         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 155
 156     handle_akap = \
 157     handle_akap = \
 158     handle_akap_cd = \
 159     handle_akap_cd = \
 160     handle_akap_dialog = \
 161     handle_akap_dialog = \
 162     handle_autor_utworu = \
 163     handle_dedykacja = \
 164     handle_didaskalia = \
 165     handle_didask_tekst = \
 166     handle_dlugi_cytat = \
 167     handle_dzielo_nadrzedne = \
 168     handle_lista_osoba = \
 169     handle_mat = \
 170     handle_miejsce_czas = \
 171     handle_motto = \
 172     handle_motto_podpis = \
 173     handle_naglowek_akt = \
 174     handle_naglowek_czesc = \
 175     handle_naglowek_listy = \
 176     handle_naglowek_osoba = \
 177     handle_naglowek_podrozdzial = \
 178     handle_naglowek_scena = \
 179     handle_nazwa_utworu = \
 180     handle_nota = \
 181     handle_osoba = \
 182     handle_pa = \
 183     handle_pe = \
 184     handle_podtytul = \
 185     handle_poezja_cyt = \
 186     handle_pr = \
 187     handle_pt = \
 188     handle_sekcja_asterysk = \
 189     handle_sekcja_swiatlo = \
 190     handle_separator_linia = \
 191     handle_slowo_obce = \
 192     handle_srodtytul = \
 193     handle_tytul_dziela = \
 194     handle_wyroznienie = \
 195     handle_dywiz = \
 196     handle_texcommand
 197
 198     def handle_naglowek_rozdzial(self, element):
 199         if not self.options['teacher']:
 200             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek')):
 201                 self.state['mute'] = False
 202             else:
 203                 self.state['mute'] = True
 204                 return None
 205         return self.handle_texcommand(element)
 206     handle_naglowek_rozdzial.unmuter = True
 207
 208
 209     def handle_uwaga(self, _e):
 210         return None
 211     def handle_extra(self, _e):
 212         return None
 213
 214     def handle_nbsp(self, _e):
 215         return '<spec cat="tilde" />'
 216
 217     _handle_strofa = cmd("strofa")
 218
 219     def handle_strofa(self, element):
 220         self.options = {'strofa': True}
 221         return self._handle_strofa(element)
 222
 223     def handle_aktywnosc(self, element):
 224         self.activity_counter += 1
 225         self.options = {
 226             'activity': True,
 227             'activity_counter': self.activity_counter,
 228             'sub_gen': True,
 229         }
 230         submill = EduModule(self.options, self.state)
 231
 232         if element.xpath('opis'):
 233             opis = submill.generate(element.xpath('opis')[0])
 234         else:
 235             opis = ''
 236
 237         n = element.xpath('wskazowki')
 238         if n: wskazowki = submill.generate(n[0])
 239
 240         else: wskazowki = ''
 241         n = element.xpath('pomoce')
 242
 243         if n: pomoce = submill.generate(n[0])
 244         else: pomoce = ''
 245
 246         forma = ''.join(element.xpath('forma/text()'))
 247
 248         czas = ''.join(element.xpath('czas/text()'))
 249
 250         counter = self.activity_counter
 251
 252         if element.getnext().tag == 'aktywnosc' or self.activity_last.getnext() == element:
 253             counter_tex = """<cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>""" % locals()
 254         else:
 255             counter_tex = ''
 256
 257         self.activity_last = element
 258
 259         return u"""
 260 <cmd name="noindent" />
 261 %(counter_tex)s
 262 <cmd name="activityinfo"><parm>
 263  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 264  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 265  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 266 </parm></cmd>
 267
 268
 269 %(opis)s
 270
 271 %(wskazowki)s
 272 """ % locals()
 273
 274     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 275     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 276
 277     @ifoption(sub_gen=True)
 278     def handle_pomoce(self, _):
 279         return "Pomoce: ", ""
 280
 281     def handle_czas(self, *_):
 282         return
 283
 284     def handle_forma(self, *_):
 285         return
 286
 287     def handle_lista(self, element, attrs={}):
 288         ltype = element.attrib.get('typ', 'punkt')
 289         if not element.findall("punkt"):
 290             if ltype == 'czytelnia':
 291                 return 'W przygotowaniu.'
 292             else:
 293                 return None
 294         if ltype == 'slowniczek':
 295             surl = element.attrib.get('src', None)
 296             if surl is None:
 297                 # print '** missing src on <slowniczek>, setting default'
 298                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 299             sxml = None
 300             if surl:
 301                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 302             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 303
 304         listcmd = {'num': 'enumerate',
 305                'punkt': 'itemize',
 306                'alfa': 'itemize',
 307                'slowniczek': 'itemize',
 308                'czytelnia': 'itemize'}[ltype]
 309
 310         return u'<env name="%s">' % listcmd, u'</env>'
 311
 312     def handle_punkt(self, element):
 313         return '<cmd name="item"/>', ''
 314
 315     def handle_cwiczenie(self, element):
 316         exercise_handlers = {
 317             'wybor': Wybor,
 318             'uporzadkuj': Uporzadkuj,
 319             'luki': Luki,
 320             'zastap': Zastap,
 321             'przyporzadkuj': Przyporzadkuj,
 322             'prawdafalsz': PrawdaFalsz
 323         }
 324
 325         typ = element.attrib['typ']
 326         self.exercise_counter += 1
 327         if not typ in exercise_handlers:
 328             return '(no handler)'
 329         self.options = {'exercise_counter': self.exercise_counter}
 330         handler = exercise_handlers[typ](self.options, self.state)
 331         return handler.generate(element)
 332
 333     # XXX this is copied from pyhtml.py, except for return and
 334     # should be refactored for no code duplication
 335     def handle_definiendum(self, element):
 336         nxt = element.getnext()
 337         definiens_s = ''
 338
 339         # let's pull definiens from another document
 340         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 341             sxml = self.options['slowniczek_xml']
 342             assert element.text != ''
 343             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 344             if defloc:
 345                 definiens = defloc[0].getnext()
 346                 if definiens.tag == 'definiens':
 347                     subgen = EduModule(self.options, self.state)
 348                     definiens_s = subgen.generate(definiens)
 349
 350         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 351
 352     def handle_definiens(self, element):
 353         return u"", u""
 354
 355     def handle_podpis(self, element):
 356         return u"""<env name="figure">""", u"</env>"
 357
 358     def handle_tabela(self, element):
 359         max_col = 0
 360         for w in element.xpath("wiersz"):
 361             ks = w.xpath("kol")
 362             if max_col < len(ks):
 363                 max_col = len(ks)
 364         self.options = {'columnts': max_col}
 365         # styling:
 366                 #        has_frames = int(element.attrib.get("ramki", "0"))
 367                 #        if has_frames: frames_c = "framed"
 368                 #        else: frames_c = ""
 369                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 370         return u'''
 371 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 372     ''' % ('l' * max_col), \
 373     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 374
 375     @escape(1)
 376     def handle_wiersz(self, element):
 377         return u"", u'<ctrl ch="\\"/>'
 378
 379     @escape(1)
 380     def handle_kol(self, element):
 381         if element.getnext() is not None:
 382             return u"", u'<spec cat="align" />'
 383         return u"", u""
 384
 385     def handle_link(self, element):
 386         if element.attrib.get('url'):
 387             url = element.attrib.get('url')
 388             if url == element.text:
 389                 return cmd('url')(self, element)
 390             else:
 391                 return cmd('href', parms=[element.attrib['url']])(self, element)
 392         else:
 393             return cmd('emph')(self, element)
 394
 395     def handle_obraz(self, element):
 396         frmt = self.options['format']
 397         name = element.attrib.get('nazwa', '').strip()
 398         image = frmt.get_image(name.strip())
 399         img_path = "obraz/%s" % name.replace("_", "")
 400         frmt.attachments[img_path] = image
 401         return cmd("obraz", parms=[img_path])(self)
 402
 403     def handle_video(self, element):
 404         url = element.attrib.get('url')
 405         if not url:
 406             print '!! <video> missing url'
 407             return
 408         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 409         if not m:
 410             print '!! unknown <video> url scheme:', url
 411             return
 412         name = m.group(1)
 413         thumb = IOFile.from_string(urlopen
 414             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 415         img_path = "video/%s.jpg" % name.replace("_", "")
 416         self.options['format'].attachments[img_path] = thumb
 417         canon_url = "https://www.youtube.com/watch?v=%s" % name
 418         return cmd("video", parms=[img_path, canon_url])(self)
 419
 420
 421 class Exercise(EduModule):
 422     def __init__(self, *args, **kw):
 423         self.question_counter = 0
 424         super(Exercise, self).__init__(*args, **kw)
 425
 426     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 427
 428     def handle_cwiczenie(self, element):
 429         self.options = {
 430             'exercise': element.attrib['typ'],
 431             'sub_gen': True,
 432         }
 433         self.question_counter = 0
 434         self.piece_counter = 0
 435
 436         header = etree.Element("parm")
 437         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 438         header_cmd.append(header)
 439         header.text = u"Zadanie %d." % self.options['exercise_counter']
 440
 441         pre = etree.tostring(header_cmd, encoding=unicode)
 442         post = u""
 443         # Add a single <pytanie> tag if it's not there
 444         if not element.xpath(".//pytanie"):
 445             qpre, qpost = self.handle_pytanie(element)
 446             pre = pre + qpre
 447             post = qpost + post
 448         return pre, post
 449
 450     def handle_pytanie(self, element):
 451         """This will handle <cwiczenie> element, when there is no <pytanie>
 452         """
 453         self.question_counter += 1
 454         self.piece_counter = 0
 455         pre = post = u""
 456         if self.options['teacher'] and element.attrib.get('rozw'):
 457             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 458         return pre, post
 459
 460     def handle_punkt(self, element):
 461         pre, post = super(Exercise, self).handle_punkt(element)
 462         if self.options['teacher'] and element.attrib.get('rozw'):
 463             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 464         return pre, post
 465
 466     def solution_header(self):
 467         par = etree.Element("cmd", name="par")
 468         parm = etree.Element("parm")
 469         parm.text = u"Rozwiązanie:"
 470         par.append(parm)
 471         return etree.tostring(par)
 472
 473     def explicit_solution(self):
 474         if self.options['solution']:
 475             par = etree.Element("cmd", name="par")
 476             parm = etree.Element("parm")
 477             parm.text = self.options['solution']
 478             par.append(parm)
 479             return self.solution_header() + etree.tostring(par)
 480
 481
 482
 483 class Wybor(Exercise):
 484     def handle_cwiczenie(self, element):
 485         pre, post = super(Wybor, self).handle_cwiczenie(element)
 486         is_single_choice = True
 487         pytania = element.xpath(".//pytanie")
 488         if not pytania:
 489             pytania = [element]
 490         for p in pytania:
 491             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 492             if len(solutions) != 1:
 493                 is_single_choice = False
 494                 break
 495             choices = p.xpath(".//*[@nazwa]")
 496             uniq = set()
 497             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 498             if len(choices) != len(uniq):
 499                 is_single_choice = False
 500                 break
 501
 502         self.options = {'single': is_single_choice}
 503         return pre, post
 504
 505     def handle_punkt(self, element):
 506         if self.options['exercise'] and element.attrib.get('nazwa', None):
 507             cmd = 'radio' if self.options['single'] else 'checkbox'
 508             return u'<cmd name="%s"/>' % cmd, ''
 509         else:
 510             return super(Wybor, self).handle_punkt(element)
 511
 512
 513 class Uporzadkuj(Exercise):
 514     def handle_pytanie(self, element):
 515         order_items = element.xpath(".//punkt/@rozw")
 516         return super(Uporzadkuj, self).handle_pytanie(element)
 517
 518
 519 class Przyporzadkuj(Exercise):
 520     def handle_lista(self, lista):
 521         header = etree.Element("parm")
 522         header_cmd = etree.Element("cmd", name="par")
 523         header_cmd.append(header)
 524         if 'nazwa' in lista.attrib:
 525             header.text = u"Kategorie:"
 526         elif 'cel' in lista.attrib:
 527             header.text = u"Elementy do przyporządkowania:"
 528         else:
 529             header.text = u"Lista:"
 530         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 531         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 532         return pre, post
 533
 534
 535 class Luki(Exercise):
 536     def find_pieces(self, question):
 537         return question.xpath(".//luka")
 538
 539     def solution(self, piece):
 540         piece = deepcopy(piece)
 541         piece.tail = None
 542         sub = EduModule()
 543         return sub.generate(piece)
 544
 545     def handle_pytanie(self, element):
 546         qpre, qpost = super(Luki, self).handle_pytanie(element)
 547
 548         luki = self.find_pieces(element)
 549         random.shuffle(luki)
 550         self.words = u"<env name='itemize'>%s</env>" % (
 551             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 552         )
 553         return qpre, qpost
 554
 555     def handle_opis(self, element):
 556         return '', self.words
 557
 558     def handle_luka(self, element):
 559         luka = "_" * 10
 560         if self.options['teacher']:
 561             piece = deepcopy(element)
 562             piece.tail = None
 563             sub = EduModule()
 564             text = sub.generate(piece)
 565             luka += u" [rozwiązanie: %s]" % text
 566         return luka
 567
 568
 569 class Zastap(Luki):
 570     def find_pieces(self, question):
 571         return question.xpath(".//zastap")
 572
 573     def solution(self, piece):
 574         return piece.attrib.get('rozw', '')
 575
 576     def list_header(self):
 577         return u"Elementy do wstawienia"
 578
 579     def handle_zastap(self, element):
 580         piece = deepcopy(element)
 581         piece.tail = None
 582         sub = EduModule()
 583         text = sub.generate(piece)
 584         if self.options['teacher'] and element.attrib.get('rozw'):
 585             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 586         return text
 587
 588
 589 class PrawdaFalsz(Exercise):
 590     def handle_punkt(self, element):
 591         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 592         if 'rozw' in element.attrib:
 593             post += u" [Prawda/Fałsz]"
 594         return pre, post
 595
 596
 597
 598 def fix_lists(tree):
 599     lists = tree.xpath(".//lista")
 600     for l in lists:
 601         if l.text:
 602             p = l.getprevious()
 603             if p is not None:
 604                 if p.tail is None: p.tail = ''
 605                 p.tail += l.text
 606             else:
 607                 p = l.getparent()
 608                 if p.text is None: p.text = ''
 609                 p.text += l.text
 610             l.text = ''
 611     return tree
 612
 613
 614 class EduModulePDFFormat(PDFFormat):
 615     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 616
 617     def get_texml(self):
 618         substitute_hyphens(self.wldoc.edoc)
 619         fix_hanging(self.wldoc.edoc)
 620
 621         self.attachments = {}
 622         edumod = EduModule({
 623             "wldoc": self.wldoc,
 624             "format": self,
 625             "teacher": self.customization.get('teacher'),
 626         })
 627         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 628
 629         open("/tmp/texml.xml", "w").write(texml)
 630         return texml
 631
 632     def get_tex_dir(self):
 633         temp = super(EduModulePDFFormat, self).get_tex_dir()
 634         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 635         for name, iofile in self.attachments.items():
 636             iofile.save_as(os.path.join(temp, name))
 637         return temp
 638
 639     def get_image(self, name):
 640         return self.wldoc.source.attachments[name]
 641