librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.exercise_counter = 0
  73
  74         def swap_endlines(txt):
  75             if self.options['strofa']:
  76                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  77             return txt
  78         self.register_text_filter(swap_endlines)
  79         self.register_text_filter(functions.substitute_entities)
  80         self.register_text_filter(mark_alien_characters)
  81
  82     def get_dc(self, element, dc_field, single=False):
  83         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  84         if single:
  85             return values[0]
  86         return values
  87
  88     def handle_rdf__RDF(self, _):
  89         "skip metadata in generation"
  90         return
  91
  92     @escape(True)
  93     def get_rightsinfo(self, element):
  94         rights_lic = self.get_dc(element, 'rights.license', True)
  95         return u'<cmd name="rightsinfostr">' + \
  96           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  97           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  98           u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors)
 110
 111     @escape(1)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
 115     def handle_utwor(self, element):
 116         lines = [
 117             u'''
 118     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 119         <TeXML escape="0">
 120         \\documentclass[%s]{wl}
 121         \\usepackage{style}''' % self.options['customization_str'],
 122     self.options['has_cover'] and '\usepackage{makecover}',
 123     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 124     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 125     (self.options['morefloats'] == 'none' and
 126      u'''\\IfFileExists{morefloats.sty}{
 127             \\usepackage{morefloats}
 128         }{}'''),
 129     u'''\\def\\authors{%s}''' % self.get_authors(element),
 130     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 131     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 132     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 133
 134     u'''\\author{\\authors}''',
 135     u'''\\title{%s}''' % self.get_title(element),
 136     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 137     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 138     u'</TeXML>']
 139
 140         return u"".join(filter(None, lines)), u'</TeXML>'
 141
 142
 143     @escape(1)
 144     def handle_powiesc(self, element):
 145         return u"""
 146     <env name="document">
 147     <cmd name="maketitle"/>
 148     """, """<cmd name="editorialsection" /></env>"""
 149
 150     @escape(1)
 151     def handle_texcommand(self, element):
 152         cmd = functions.texcommand(element.tag)
 153         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 154
 155     handle_akap = \
 156     handle_akap = \
 157     handle_akap_cd = \
 158     handle_akap_cd = \
 159     handle_akap_dialog = \
 160     handle_akap_dialog = \
 161     handle_autor_utworu = \
 162     handle_dedykacja = \
 163     handle_didaskalia = \
 164     handle_didask_tekst = \
 165     handle_dlugi_cytat = \
 166     handle_dzielo_nadrzedne = \
 167     handle_lista_osoba = \
 168     handle_mat = \
 169     handle_miejsce_czas = \
 170     handle_motto = \
 171     handle_motto_podpis = \
 172     handle_naglowek_akt = \
 173     handle_naglowek_czesc = \
 174     handle_naglowek_listy = \
 175     handle_naglowek_osoba = \
 176     handle_naglowek_podrozdzial = \
 177     handle_naglowek_scena = \
 178     handle_nazwa_utworu = \
 179     handle_nota = \
 180     handle_osoba = \
 181     handle_pa = \
 182     handle_pe = \
 183     handle_podtytul = \
 184     handle_poezja_cyt = \
 185     handle_pr = \
 186     handle_pt = \
 187     handle_sekcja_asterysk = \
 188     handle_sekcja_swiatlo = \
 189     handle_separator_linia = \
 190     handle_slowo_obce = \
 191     handle_srodtytul = \
 192     handle_tytul_dziela = \
 193     handle_wyroznienie = \
 194     handle_dywiz = \
 195     handle_texcommand
 196
 197     def handle_naglowek_rozdzial(self, element):
 198         if not self.options['teacher']:
 199             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek')):
 200                 self.state['mute'] = False
 201             else:
 202                 self.state['mute'] = True
 203                 return None
 204         return self.handle_texcommand(element)
 205     handle_naglowek_rozdzial.unmuter = True
 206
 207
 208     def handle_uwaga(self, _e):
 209         return None
 210     def handle_extra(self, _e):
 211         return None
 212
 213     def handle_nbsp(self, _e):
 214         return '<spec cat="tilde" />'
 215
 216     _handle_strofa = cmd("strofa")
 217
 218     def handle_strofa(self, element):
 219         self.options = {'strofa': True}
 220         return self._handle_strofa(element)
 221
 222     def handle_aktywnosc(self, element):
 223         self.activity_counter += 1
 224         self.options = {
 225             'activity': True,
 226             'activity_counter': self.activity_counter,
 227             'sub_gen': True,
 228         }
 229         submill = EduModule(self.options, self.state)
 230
 231         if element.xpath('opis'):
 232             opis = submill.generate(element.xpath('opis')[0])
 233         else:
 234             opis = ''
 235
 236         n = element.xpath('wskazowki')
 237         if n: wskazowki = submill.generate(n[0])
 238
 239         else: wskazowki = ''
 240         n = element.xpath('pomoce')
 241
 242         if n: pomoce = submill.generate(n[0])
 243         else: pomoce = ''
 244
 245         forma = ''.join(element.xpath('forma/text()'))
 246
 247         czas = ''.join(element.xpath('czas/text()'))
 248
 249         counter = self.activity_counter
 250
 251         return u"""
 252 <cmd name="noindent" />
 253 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 254 <cmd name="activityinfo"><parm>
 255  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 256  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 257  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 258 </parm></cmd>
 259
 260
 261 %(opis)s
 262
 263 %(wskazowki)s
 264 """ % locals()
 265
    # <opis> and <wskazowki> contribute no markup of their own: the handler
    # returns an empty (opening, closing) pair.  Both are gated by
    # ifoption(sub_gen=True); handle_aktywnosc sets that option before
    # rendering these children in its sub-generator.
    # NOTE(review): what ifoption yields when the option does not match is
    # defined in xmlutils and not visible here - confirm.
    handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
    handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 268
 269     @ifoption(sub_gen=True)
 270     def handle_pomoce(self, _):
 271         return "Pomoce: ", ""
 272
 273     def handle_czas(self, *_):
 274         return
 275
 276     def handle_forma(self, *_):
 277         return
 278
 279     def handle_lista(self, element, attrs={}):
 280         ltype = element.attrib.get('typ', 'punkt')
 281         if not element.findall("punkt"):
 282             if ltype == 'czytelnia':
 283                 return 'W przygotowaniu.'
 284             else:
 285                 return None
 286         if ltype == 'slowniczek':
 287             surl = element.attrib.get('src', None)
 288             if surl is None:
 289                 # print '** missing src on <slowniczek>, setting default'
 290                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 291             sxml = None
 292             if surl:
 293                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 294             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 295
 296         listcmd = {'num': 'enumerate',
 297                'punkt': 'itemize',
 298                'alfa': 'itemize',
 299                'slowniczek': 'itemize',
 300                'czytelnia': 'itemize'}[ltype]
 301
 302         return u'<env name="%s">' % listcmd, u'</env>'
 303
 304     def handle_punkt(self, element):
 305         return '<cmd name="item"/>', ''
 306
 307     def handle_cwiczenie(self, element):
 308         exercise_handlers = {
 309             'wybor': Wybor,
 310             'uporzadkuj': Uporzadkuj,
 311             'luki': Luki,
 312             'zastap': Zastap,
 313             'przyporzadkuj': Przyporzadkuj,
 314             'prawdafalsz': PrawdaFalsz
 315         }
 316
 317         typ = element.attrib['typ']
 318         self.exercise_counter += 1
 319         if not typ in exercise_handlers:
 320             return '(no handler)'
 321         self.options = {'exercise_counter': self.exercise_counter}
 322         handler = exercise_handlers[typ](self.options, self.state)
 323         return handler.generate(element)
 324
 325     # XXX this is copied from pyhtml.py, except for return and
 326     # should be refactored for no code duplication
 327     def handle_definiendum(self, element):
 328         nxt = element.getnext()
 329         definiens_s = ''
 330
 331         # let's pull definiens from another document
 332         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 333             sxml = self.options['slowniczek_xml']
 334             assert element.text != ''
 335             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 336             if defloc:
 337                 definiens = defloc[0].getnext()
 338                 if definiens.tag == 'definiens':
 339                     subgen = EduModule(self.options, self.state)
 340                     definiens_s = subgen.generate(definiens)
 341
 342         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 343
 344     def handle_definiens(self, element):
 345         return u"", u""
 346
 347     def handle_podpis(self, element):
 348         return u"""<env name="figure">""", u"</env>"
 349
 350     def handle_tabela(self, element):
 351         max_col = 0
 352         for w in element.xpath("wiersz"):
 353             ks = w.xpath("kol")
 354             if max_col < len(ks):
 355                 max_col = len(ks)
 356         self.options = {'columnts': max_col}
 357         # styling:
 358                 #        has_frames = int(element.attrib.get("ramki", "0"))
 359                 #        if has_frames: frames_c = "framed"
 360                 #        else: frames_c = ""
 361                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 362         return u'''
 363 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 364     ''' % ('l' * max_col), \
 365     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 366
 367     @escape(1)
 368     def handle_wiersz(self, element):
 369         return u"", u'<ctrl ch="\\"/>'
 370
 371     @escape(1)
 372     def handle_kol(self, element):
 373         if element.getnext() is not None:
 374             return u"", u'<spec cat="align" />'
 375         return u"", u""
 376
 377     def handle_link(self, element):
 378         if element.attrib.get('url'):
 379             url = element.attrib.get('url')
 380             if url == element.text:
 381                 return cmd('url')(self, element)
 382             else:
 383                 return cmd('href', parms=[element.attrib['url']])(self, element)
 384         else:
 385             return cmd('emph')(self, element)
 386
 387     def handle_obraz(self, element):
 388         frmt = self.options['format']
 389         name = element.attrib.get('nazwa', '').strip()
 390         image = frmt.get_image(name.strip())
 391         img_path = "obraz/%s" % name.replace("_", "")
 392         frmt.attachments[img_path] = image
 393         return cmd("obraz", parms=[img_path])(self)
 394
 395     def handle_video(self, element):
 396         url = element.attrib.get('url')
 397         if not url:
 398             print '!! <video> missing url'
 399             return
 400         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 401         if not m:
 402             print '!! unknown <video> url scheme:', url
 403             return
 404         name = m.group(1)
 405         thumb = IOFile.from_string(urlopen
 406             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 407         img_path = "video/%s.jpg" % name.replace("_", "")
 408         self.options['format'].attachments[img_path] = thumb
 409         canon_url = "https://www.youtube.com/watch?v=%s" % name
 410         return cmd("video", parms=[img_path, canon_url])(self)
 411
 412
 413 class Exercise(EduModule):
 414     def __init__(self, *args, **kw):
 415         self.question_counter = 0
 416         super(Exercise, self).__init__(*args, **kw)
 417
 418     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 419
 420     def handle_cwiczenie(self, element):
 421         self.options = {
 422             'exercise': element.attrib['typ'],
 423             'sub_gen': True,
 424         }
 425         self.question_counter = 0
 426         self.piece_counter = 0
 427
 428         header = etree.Element("parm")
 429         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 430         header_cmd.append(header)
 431         header.text = u"Zadanie %d." % self.options['exercise_counter']
 432
 433         pre = etree.tostring(header_cmd, encoding=unicode)
 434         post = u""
 435         # Add a single <pytanie> tag if it's not there
 436         if not element.xpath(".//pytanie"):
 437             qpre, qpost = self.handle_pytanie(element)
 438             pre = pre + qpre
 439             post = qpost + post
 440         return pre, post
 441
 442     def handle_pytanie(self, element):
 443         """This will handle <cwiczenie> element, when there is no <pytanie>
 444         """
 445         self.question_counter += 1
 446         self.piece_counter = 0
 447         pre = post = u""
 448         if self.options['teacher'] and element.attrib.get('rozw'):
 449             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 450         return pre, post
 451
 452     def handle_punkt(self, element):
 453         pre, post = super(Exercise, self).handle_punkt(element)
 454         if self.options['teacher'] and element.attrib.get('rozw'):
 455             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 456         return pre, post
 457
 458     def solution_header(self):
 459         par = etree.Element("cmd", name="par")
 460         parm = etree.Element("parm")
 461         parm.text = u"Rozwiązanie:"
 462         par.append(parm)
 463         return etree.tostring(par)
 464
 465     def explicit_solution(self):
 466         if self.options['solution']:
 467             par = etree.Element("cmd", name="par")
 468             parm = etree.Element("parm")
 469             parm.text = self.options['solution']
 470             par.append(parm)
 471             return self.solution_header() + etree.tostring(par)
 472
 473
 474
 475 class Wybor(Exercise):
 476     def handle_cwiczenie(self, element):
 477         pre, post = super(Wybor, self).handle_cwiczenie(element)
 478         is_single_choice = True
 479         pytania = element.xpath(".//pytanie")
 480         if not pytania:
 481             pytania = [element]
 482         for p in pytania:
 483             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 484             if len(solutions) != 1:
 485                 is_single_choice = False
 486                 break
 487             choices = p.xpath(".//*[@nazwa]")
 488             uniq = set()
 489             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 490             if len(choices) != len(uniq):
 491                 is_single_choice = False
 492                 break
 493
 494         self.options = {'single': is_single_choice}
 495         return pre, post
 496
 497     def handle_punkt(self, element):
 498         if self.options['exercise'] and element.attrib.get('nazwa', None):
 499             cmd = 'radio' if self.options['single'] else 'checkbox'
 500             return u'<cmd name="%s"/>' % cmd, ''
 501         else:
 502             return super(Wybor, self).handle_punkt(element)
 503
 504
 505 class Uporzadkuj(Exercise):
 506     def handle_pytanie(self, element):
 507         order_items = element.xpath(".//punkt/@rozw")
 508         return super(Uporzadkuj, self).handle_pytanie(element)
 509
 510
 511 class Przyporzadkuj(Exercise):
 512     def handle_lista(self, lista):
 513         header = etree.Element("parm")
 514         header_cmd = etree.Element("cmd", name="par")
 515         header_cmd.append(header)
 516         if 'nazwa' in lista.attrib:
 517             header.text = u"Kategorie:"
 518         elif 'cel' in lista.attrib:
 519             header.text = u"Elementy do przyporządkowania:"
 520         else:
 521             header.text = u"Lista:"
 522         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 523         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 524         return pre, post
 525
 526
 527 class Luki(Exercise):
 528     def find_pieces(self, question):
 529         return question.xpath(".//luka")
 530
 531     def solution(self, piece):
 532         piece = deepcopy(piece)
 533         piece.tail = None
 534         sub = EduModule()
 535         return sub.generate(piece)
 536
 537     def handle_pytanie(self, element):
 538         qpre, qpost = super(Luki, self).handle_pytanie(element)
 539
 540         luki = self.find_pieces(element)
 541         random.shuffle(luki)
 542         self.words = u"<env name='itemize'>%s</env>" % (
 543             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 544         )
 545         return qpre, qpost
 546
 547     def handle_opis(self, element):
 548         return '', self.words
 549
 550     def handle_luka(self, element):
 551         luka = "_" * 10
 552         if self.options['teacher']:
 553             piece = deepcopy(element)
 554             piece.tail = None
 555             sub = EduModule()
 556             text = sub.generate(piece)
 557             luka += u" [rozwiązanie: %s]" % text
 558         return luka
 559
 560
 561 class Zastap(Luki):
 562     def find_pieces(self, question):
 563         return question.xpath(".//zastap")
 564
 565     def solution(self, piece):
 566         return piece.attrib.get('rozw', '')
 567
 568     def list_header(self):
 569         return u"Elementy do wstawienia"
 570
 571     def handle_zastap(self, element):
 572         piece = deepcopy(element)
 573         piece.tail = None
 574         sub = EduModule()
 575         text = sub.generate(piece)
 576         if self.options['teacher'] and element.attrib.get('rozw'):
 577             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 578         return text
 579
 580
 581 class PrawdaFalsz(Exercise):
 582     def handle_punkt(self, element):
 583         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 584         if 'rozw' in element.attrib:
 585             post += u" [Prawda/Fałsz]"
 586         return pre, post
 587
 588
 589
 590 def fix_lists(tree):
 591     lists = tree.xpath(".//lista")
 592     for l in lists:
 593         if l.text:
 594             p = l.getprevious()
 595             if p is not None:
 596                 if p.tail is None: p.tail = ''
 597                 p.tail += l.text
 598             else:
 599                 p = l.getparent()
 600                 if p.text is None: p.text = ''
 601                 p.text += l.text
 602             l.text = ''
 603     return tree
 604
 605
 606 class EduModulePDFFormat(PDFFormat):
 607     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 608
 609     def get_texml(self):
 610         substitute_hyphens(self.wldoc.edoc)
 611         fix_hanging(self.wldoc.edoc)
 612
 613         self.attachments = {}
 614         edumod = EduModule({
 615             "wldoc": self.wldoc,
 616             "format": self,
 617             "teacher": self.customization.get('teacher'),
 618         })
 619         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 620
 621         open("/tmp/texml.xml", "w").write(texml)
 622         return texml
 623
 624     def get_tex_dir(self):
 625         temp = super(EduModulePDFFormat, self).get_tex_dir()
 626         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 627         for name, iofile in self.attachments.items():
 628             iofile.save_as(os.path.join(temp, name))
 629         return temp
 630
 631     def get_image(self, name):
 632         return self.wldoc.source.attachments[name]
 633