librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
Creates one big XML document from the book and its children, converts it to
LaTeX with TeXML, then runs the result through XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None, state=None):
  70         super(EduModule, self).__init__(options, state)
  71         self.activity_counter = 0
  72         self.exercise_counter = 0
  73
  74         def swap_endlines(txt):
  75             if self.options['strofa']:
  76                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  77             return txt
  78         self.register_text_filter(swap_endlines)
  79         self.register_text_filter(functions.substitute_entities)
  80         self.register_text_filter(mark_alien_characters)
  81
  82     def get_dc(self, element, dc_field, single=False):
  83         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  84         if single:
  85             return values[0]
  86         return values
  87
  88     def handle_rdf__RDF(self, _):
  89         "skip metadata in generation"
  90         return
  91
  92     @escape(True)
  93     def get_rightsinfo(self, element):
  94         rights_lic = self.get_dc(element, 'rights.license', True)
  95         return u'<cmd name="rightsinfostr">' + \
  96           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  97           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  98           u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors)
 110
 111     @escape(1)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
    def handle_utwor(self, element):
        """Open the TeXML document and emit the LaTeX preamble.

        The preamble holds the document class (with the customization
        string), the optional cover and morefloats packages, and the macros
        for authors, title, book URL and rights information.

        :returns: an ``(opening, closing)`` pair; the closing item is the end
            tag of the root ``<TeXML>`` element opened in the first fragment.
        """
        # Fragments are concatenated without separators; entries that evaluate
        # to a falsy value (option not set / not matched) are dropped by
        # filter(None, ...) below.
        lines = [
            u'''
    <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
        <TeXML escape="0">
        \\documentclass[%s]{wl}
        \\usepackage{style}''' % self.options['customization_str'],
    # Cover package only when the 'has_cover' option is truthy.
    self.options['has_cover'] and '\usepackage{makecover}',
    # 'morefloats' picks one of three package-loading variants.
    (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
    (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
    (self.options['morefloats'] == 'none' and
     u'''\\IfFileExists{morefloats.sty}{
            \\usepackage{morefloats}
        }{}'''),
    u'''\\def\\authors{%s}''' % self.get_authors(element),
    u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
    u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
    u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),

    u'''\\author{\\authors}''',
    u'''\\title{%s}''' % self.get_title(element),
    u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
    u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
    # Closes the inner <TeXML escape="0"> opened in the first fragment.
    u'</TeXML>']

        return u"".join(filter(None, lines)), u'</TeXML>'
 141
 142
 143     @escape(1)
 144     def handle_powiesc(self, element):
 145         return u"""
 146     <env name="document">
 147     <cmd name="maketitle"/>
 148     """, """<cmd name="editorialsection" /></env>"""
 149
 150     @escape(1)
 151     def handle_texcommand(self, element):
 152         cmd = functions.texcommand(element.tag)
 153         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 154
 155     handle_akap = \
 156     handle_akap = \
 157     handle_akap_cd = \
 158     handle_akap_cd = \
 159     handle_akap_dialog = \
 160     handle_akap_dialog = \
 161     handle_autor_utworu = \
 162     handle_dedykacja = \
 163     handle_didaskalia = \
 164     handle_didask_tekst = \
 165     handle_dlugi_cytat = \
 166     handle_dzielo_nadrzedne = \
 167     handle_lista_osoba = \
 168     handle_mat = \
 169     handle_miejsce_czas = \
 170     handle_motto = \
 171     handle_motto_podpis = \
 172     handle_naglowek_akt = \
 173     handle_naglowek_czesc = \
 174     handle_naglowek_listy = \
 175     handle_naglowek_osoba = \
 176     handle_naglowek_podrozdzial = \
 177     handle_naglowek_scena = \
 178     handle_nazwa_utworu = \
 179     handle_nota = \
 180     handle_osoba = \
 181     handle_pa = \
 182     handle_pe = \
 183     handle_podtytul = \
 184     handle_poezja_cyt = \
 185     handle_pr = \
 186     handle_pt = \
 187     handle_sekcja_asterysk = \
 188     handle_sekcja_swiatlo = \
 189     handle_separator_linia = \
 190     handle_slowo_obce = \
 191     handle_srodtytul = \
 192     handle_tytul_dziela = \
 193     handle_wyroznienie = \
 194     handle_dywiz = \
 195     handle_texcommand
 196
 197     def handle_naglowek_rozdzial(self, element):
 198         if not self.options['teacher']:
 199             print element.text
 200             if element.text.startswith((u'Wiedza', u'Zadania', u'Słowniczek')):
 201                 print 'not mute'
 202                 self.state['mute'] = False
 203             else:
 204                 print 'mute'
 205                 self.state['mute'] = True
 206                 return None
 207         return self.handle_texcommand(element)
 208     handle_naglowek_rozdzial.unmuter = True
 209
 210
 211     def handle_uwaga(self, _e):
 212         return None
 213     def handle_extra(self, _e):
 214         return None
 215
 216     def handle_nbsp(self, _e):
 217         return '<spec cat="tilde" />'
 218
 219     _handle_strofa = cmd("strofa")
 220
 221     def handle_strofa(self, element):
 222         self.options = {'strofa': True}
 223         return self._handle_strofa(element)
 224
 225     def handle_aktywnosc(self, element):
 226         self.activity_counter += 1
 227         self.options = {
 228             'activity': True,
 229             'activity_counter': self.activity_counter,
 230             'sub_gen': True,
 231         }
 232         submill = EduModule(self.options, self.state)
 233
 234         if element.xpath('opis'):
 235             opis = submill.generate(element.xpath('opis')[0])
 236         else:
 237             opis = ''
 238
 239         n = element.xpath('wskazowki')
 240         if n: wskazowki = submill.generate(n[0])
 241
 242         else: wskazowki = ''
 243         n = element.xpath('pomoce')
 244
 245         if n: pomoce = submill.generate(n[0])
 246         else: pomoce = ''
 247
 248         forma = ''.join(element.xpath('forma/text()'))
 249
 250         czas = ''.join(element.xpath('czas/text()'))
 251
 252         counter = self.activity_counter
 253
 254         return u"""
 255 <cmd name="noindent" />
 256 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 257 <cmd name="activityinfo"><parm>
 258  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 259  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 260  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 261 </parm></cmd>
 262
 263
 264 %(opis)s
 265
 266 %(wskazowki)s
 267 """ % locals()
 268
    # <opis> and <wskazowki> add no markup of their own: both handlers return
    # an empty (opening, closing) pair. They are gated by
    # ifoption(sub_gen=True) - presumably active only while the 'sub_gen'
    # option is set, as in the sub-generator created by handle_aktywnosc;
    # confirm against xmlutils.ifoption.
    handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
    handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 271
 272     @ifoption(sub_gen=True)
 273     def handle_pomoce(self, _):
 274         return "Pomoce: ", ""
 275
 276     def handle_czas(self, *_):
 277         return
 278
 279     def handle_forma(self, *_):
 280         return
 281
 282     def handle_lista(self, element, attrs={}):
 283         ltype = element.attrib.get('typ', 'punkt')
 284         if not element.findall("punkt"):
 285             if ltype == 'czytelnia':
 286                 return 'W przygotowaniu.'
 287             else:
 288                 return None
 289         if ltype == 'slowniczek':
 290             surl = element.attrib.get('src', None)
 291             if surl is None:
 292                 # print '** missing src on <slowniczek>, setting default'
 293                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 294             sxml = None
 295             if surl:
 296                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 297             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 298
 299         listcmd = {'num': 'enumerate',
 300                'punkt': 'itemize',
 301                'alfa': 'itemize',
 302                'slowniczek': 'itemize',
 303                'czytelnia': 'itemize'}[ltype]
 304
 305         return u'<env name="%s">' % listcmd, u'</env>'
 306
 307     def handle_punkt(self, element):
 308         return '<cmd name="item"/>', ''
 309
 310     def handle_cwiczenie(self, element):
 311         exercise_handlers = {
 312             'wybor': Wybor,
 313             'uporzadkuj': Uporzadkuj,
 314             'luki': Luki,
 315             'zastap': Zastap,
 316             'przyporzadkuj': Przyporzadkuj,
 317             'prawdafalsz': PrawdaFalsz
 318         }
 319
 320         typ = element.attrib['typ']
 321         self.exercise_counter += 1
 322         if not typ in exercise_handlers:
 323             return '(no handler)'
 324         self.options = {'exercise_counter': self.exercise_counter}
 325         handler = exercise_handlers[typ](self.options, self.state)
 326         return handler.generate(element)
 327
 328     # XXX this is copied from pyhtml.py, except for return and
 329     # should be refactored for no code duplication
 330     def handle_definiendum(self, element):
 331         nxt = element.getnext()
 332         definiens_s = ''
 333
 334         # let's pull definiens from another document
 335         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 336             sxml = self.options['slowniczek_xml']
 337             assert element.text != ''
 338             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 339             if defloc:
 340                 definiens = defloc[0].getnext()
 341                 if definiens.tag == 'definiens':
 342                     subgen = EduModule(self.options, self.state)
 343                     definiens_s = subgen.generate(definiens)
 344
 345         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 346
 347     def handle_definiens(self, element):
 348         return u"", u""
 349
 350     def handle_podpis(self, element):
 351         return u"""<env name="figure">""", u"</env>"
 352
 353     def handle_tabela(self, element):
 354         max_col = 0
 355         for w in element.xpath("wiersz"):
 356             ks = w.xpath("kol")
 357             if max_col < len(ks):
 358                 max_col = len(ks)
 359         self.options = {'columnts': max_col}
 360         # styling:
 361                 #        has_frames = int(element.attrib.get("ramki", "0"))
 362                 #        if has_frames: frames_c = "framed"
 363                 #        else: frames_c = ""
 364                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 365         return u'''
 366 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 367     ''' % ('l' * max_col), \
 368     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 369
 370     @escape(1)
 371     def handle_wiersz(self, element):
 372         return u"", u'<ctrl ch="\\"/>'
 373
 374     @escape(1)
 375     def handle_kol(self, element):
 376         if element.getnext() is not None:
 377             return u"", u'<spec cat="align" />'
 378         return u"", u""
 379
 380     def handle_link(self, element):
 381         if element.attrib.get('url'):
 382             url = element.attrib.get('url')
 383             if url == element.text:
 384                 return cmd('url')(self, element)
 385             else:
 386                 return cmd('href', parms=[element.attrib['url']])(self, element)
 387         else:
 388             return cmd('emph')(self, element)
 389
 390     def handle_obraz(self, element):
 391         frmt = self.options['format']
 392         name = element.attrib.get('nazwa', '').strip()
 393         image = frmt.get_image(name.strip())
 394         img_path = "obraz/%s" % name.replace("_", "")
 395         frmt.attachments[img_path] = image
 396         return cmd("obraz", parms=[img_path])(self)
 397
 398     def handle_video(self, element):
 399         url = element.attrib.get('url')
 400         if not url:
 401             print '!! <video> missing url'
 402             return
 403         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 404         if not m:
 405             print '!! unknown <video> url scheme:', url
 406             return
 407         name = m.group(1)
 408         thumb = IOFile.from_string(urlopen
 409             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 410         img_path = "video/%s.jpg" % name.replace("_", "")
 411         self.options['format'].attachments[img_path] = thumb
 412         canon_url = "https://www.youtube.com/watch?v=%s" % name
 413         return cmd("video", parms=[img_path, canon_url])(self)
 414
 415
 416 class Exercise(EduModule):
 417     def __init__(self, *args, **kw):
 418         self.question_counter = 0
 419         super(Exercise, self).__init__(*args, **kw)
 420
 421     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 422
 423     def handle_cwiczenie(self, element):
 424         self.options = {
 425             'exercise': element.attrib['typ'],
 426             'sub_gen': True,
 427         }
 428         self.question_counter = 0
 429         self.piece_counter = 0
 430
 431         header = etree.Element("parm")
 432         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 433         header_cmd.append(header)
 434         header.text = u"Zadanie %d." % self.options['exercise_counter']
 435
 436         pre = etree.tostring(header_cmd, encoding=unicode)
 437         post = u""
 438         # Add a single <pytanie> tag if it's not there
 439         if not element.xpath(".//pytanie"):
 440             qpre, qpost = self.handle_pytanie(element)
 441             pre = pre + qpre
 442             post = qpost + post
 443         return pre, post
 444
 445     def handle_pytanie(self, element):
 446         """This will handle <cwiczenie> element, when there is no <pytanie>
 447         """
 448         self.question_counter += 1
 449         self.piece_counter = 0
 450         pre = post = u""
 451         if self.options['teacher'] and element.attrib.get('rozw'):
 452             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 453         return pre, post
 454
 455     def handle_punkt(self, element):
 456         pre, post = super(Exercise, self).handle_punkt(element)
 457         if self.options['teacher'] and element.attrib.get('rozw'):
 458             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 459         return pre, post
 460
 461     def solution_header(self):
 462         par = etree.Element("cmd", name="par")
 463         parm = etree.Element("parm")
 464         parm.text = u"Rozwiązanie:"
 465         par.append(parm)
 466         return etree.tostring(par)
 467
 468     def explicit_solution(self):
 469         if self.options['solution']:
 470             par = etree.Element("cmd", name="par")
 471             parm = etree.Element("parm")
 472             parm.text = self.options['solution']
 473             par.append(parm)
 474             return self.solution_header() + etree.tostring(par)
 475
 476
 477
 478 class Wybor(Exercise):
 479     def handle_cwiczenie(self, element):
 480         pre, post = super(Wybor, self).handle_cwiczenie(element)
 481         is_single_choice = True
 482         pytania = element.xpath(".//pytanie")
 483         if not pytania:
 484             pytania = [element]
 485         for p in pytania:
 486             solutions = re.split(r"[, ]+", p.attrib.get('rozw', ''))
 487             if len(solutions) != 1:
 488                 is_single_choice = False
 489                 break
 490             choices = p.xpath(".//*[@nazwa]")
 491             uniq = set()
 492             for n in choices: uniq.add(n.attrib.get('nazwa', ''))
 493             if len(choices) != len(uniq):
 494                 is_single_choice = False
 495                 break
 496
 497         self.options = {'single': is_single_choice}
 498         return pre, post
 499
 500     def handle_punkt(self, element):
 501         if self.options['exercise'] and element.attrib.get('nazwa', None):
 502             cmd = 'radio' if self.options['single'] else 'checkbox'
 503             return u'<cmd name="%s"/>' % cmd, ''
 504         else:
 505             return super(Wybor, self).handle_punkt(element)
 506
 507
 508 class Uporzadkuj(Exercise):
 509     def handle_pytanie(self, element):
 510         order_items = element.xpath(".//punkt/@rozw")
 511         return super(Uporzadkuj, self).handle_pytanie(element)
 512
 513
 514 class Przyporzadkuj(Exercise):
 515     def handle_lista(self, lista):
 516         header = etree.Element("parm")
 517         header_cmd = etree.Element("cmd", name="par")
 518         header_cmd.append(header)
 519         if 'nazwa' in lista.attrib:
 520             header.text = u"Kategorie:"
 521         elif 'cel' in lista.attrib:
 522             header.text = u"Elementy do przyporządkowania:"
 523         else:
 524             header.text = u"Lista:"
 525         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 526         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 527         return pre, post
 528
 529
 530 class Luki(Exercise):
 531     def find_pieces(self, question):
 532         return question.xpath(".//luka")
 533
 534     def solution(self, piece):
 535         piece = deepcopy(piece)
 536         piece.tail = None
 537         sub = EduModule()
 538         return sub.generate(piece)
 539
 540     def handle_pytanie(self, element):
 541         qpre, qpost = super(Luki, self).handle_pytanie(element)
 542
 543         luki = self.find_pieces(element)
 544         random.shuffle(luki)
 545         self.words = u"<env name='itemize'>%s</env>" % (
 546             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 547         )
 548         return qpre, qpost
 549
 550     def handle_opis(self, element):
 551         return '', self.words
 552
 553     def handle_luka(self, element):
 554         luka = "_" * 10
 555         if self.options['teacher']:
 556             piece = deepcopy(element)
 557             piece.tail = None
 558             sub = EduModule()
 559             text = sub.generate(piece)
 560             luka += u" [rozwiązanie: %s]" % text
 561         return luka
 562
 563
 564 class Zastap(Luki):
 565     def find_pieces(self, question):
 566         return question.xpath(".//zastap")
 567
 568     def solution(self, piece):
 569         return piece.attrib.get('rozw', '')
 570
 571     def list_header(self):
 572         return u"Elementy do wstawienia"
 573
 574     def handle_zastap(self, element):
 575         piece = deepcopy(element)
 576         piece.tail = None
 577         sub = EduModule()
 578         text = sub.generate(piece)
 579         if self.options['teacher'] and element.attrib.get('rozw'):
 580             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 581         return text
 582
 583
 584 class PrawdaFalsz(Exercise):
 585     def handle_punkt(self, element):
 586         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 587         if 'rozw' in element.attrib:
 588             post += u" [Prawda/Fałsz]"
 589         return pre, post
 590
 591
 592
 593 def fix_lists(tree):
 594     lists = tree.xpath(".//lista")
 595     for l in lists:
 596         if l.text:
 597             p = l.getprevious()
 598             if p is not None:
 599                 if p.tail is None: p.tail = ''
 600                 p.tail += l.text
 601             else:
 602                 p = l.getparent()
 603                 if p.text is None: p.text = ''
 604                 p.text += l.text
 605             l.text = ''
 606     return tree
 607
 608
 609 class EduModulePDFFormat(PDFFormat):
 610     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 611
 612     def get_texml(self):
 613         substitute_hyphens(self.wldoc.edoc)
 614         fix_hanging(self.wldoc.edoc)
 615
 616         self.attachments = {}
 617         edumod = EduModule({
 618             "wldoc": self.wldoc,
 619             "format": self,
 620             "teacher": self.customization.get('teacher'),
 621         })
 622         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 623
 624         open("/tmp/texml.xml", "w").write(texml)
 625         return texml
 626
 627     def get_tex_dir(self):
 628         temp = super(EduModulePDFFormat, self).get_tex_dir()
 629         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 630         for name, iofile in self.attachments.items():
 631             iofile.save_as(os.path.join(temp, name))
 632         return temp
 633
 634     def get_image(self, name):
 635         return self.wldoc.source.attachments[name]
 636