librarian/pypdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from copy import deepcopy
  13 import os.path
  14 import shutil
  15 import re
  16 import random
  17 from urllib2 import urlopen
  18
  19 from lxml import etree
  20
  21 from xmlutils import Xmill, tag, tagged, ifoption, tag_open_close
  22 from librarian.dcparser import Person
  23 from librarian import DCNS, get_resource, IOFile
  24 from librarian import functions
  25 from pdf import PDFFormat, substitute_hyphens, fix_hanging
  26
  27
  28 def escape(really):
  29     def deco(f):
  30         def _wrap(*args, **kw):
  31             value = f(*args, **kw)
  32
  33             prefix = (u'<TeXML escape="%d">' % (really and 1 or 0))
  34             postfix = u'</TeXML>'
  35             if isinstance(value, list):
  36                 import pdb; pdb.set_trace()
  37             if isinstance(value, tuple):
  38                 return prefix + value[0], value[1] + postfix
  39             else:
  40                 return prefix + value + postfix
  41         return _wrap
  42     return deco
  43
  44
  45 def cmd(name, parms=None):
  46     def wrap(self, element=None):
  47         pre, post = tag_open_close('cmd', name=name)
  48
  49         if parms:
  50             for parm in parms:
  51                 e = etree.Element("parm")
  52                 e.text = parm
  53                 pre += etree.tostring(e)
  54         if element is not None:
  55             pre += "<parm>"
  56             post = "</parm>" + post
  57             return pre, post
  58         else:
  59             return pre + post
  60     return wrap
  61
  62
  63 def mark_alien_characters(text):
  64     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  65     return text
  66
  67
  68 class EduModule(Xmill):
  69     def __init__(self, options=None):
  70         super(EduModule, self).__init__(options)
  71         self.activity_counter = 0
  72         self.exercise_counter = 0
  73
  74         def swap_endlines(txt):
  75             if self.options['strofa']:
  76                 txt = txt.replace("/\n", '<ctrl ch="\\"/>')
  77             return txt
  78         self.register_text_filter(swap_endlines)
  79         self.register_text_filter(functions.substitute_entities)
  80         self.register_text_filter(mark_alien_characters)
  81
  82     def get_dc(self, element, dc_field, single=False):
  83         values = map(lambda t: t.text, element.xpath("//dc:%s" % dc_field, namespaces={'dc': DCNS.uri}))
  84         if single:
  85             return values[0]
  86         return values
  87
  88     def handle_rdf__RDF(self, _):
  89         "skip metadata in generation"
  90         return
  91
  92     @escape(True)
  93     def get_rightsinfo(self, element):
  94         rights_lic = self.get_dc(element, 'rights.license', True)
  95         return u'<cmd name="rightsinfostr">' + \
  96           (rights_lic and u'<opt>%s</opt>' % rights_lic or '') +\
  97           u'<parm>%s</parm>' % self.get_dc(element, 'rights', True) +\
  98           u'</cmd>'
  99
 100     @escape(True)
 101     def get_authors(self, element, which=None):
 102         dc = self.options['wldoc'].book_info
 103         if which is None:
 104             authors = dc.authors_textbook + \
 105                 dc.authors_scenario + \
 106                 dc.authors_expert
 107         else:
 108             authors = getattr(dc, "authors_%s" % which)
 109         return u', '.join(author.readable() for author in authors)
 110
 111     @escape(1)
 112     def get_title(self, element):
 113         return self.get_dc(element, 'title', True)
 114
 115     def handle_utwor(self, element):
 116         lines = [
 117             u'''
 118     <TeXML xmlns="http://getfo.sourceforge.net/texml/ns1">
 119         <TeXML escape="0">
 120         \\documentclass[%s]{wl}
 121         \\usepackage{style}''' % self.options['customization_str'],
 122     self.options['has_cover'] and '\usepackage{makecover}',
 123     (self.options['morefloats'] == 'new' and '\usepackage[maxfloats=64]{morefloats}') or
 124     (self.options['morefloats'] == 'old' and '\usepackage{morefloats}') or
 125     (self.options['morefloats'] == 'none' and
 126      u'''\\IfFileExists{morefloats.sty}{
 127             \\usepackage{morefloats}
 128         }{}'''),
 129     u'''\\def\\authors{%s}''' % self.get_authors(element),
 130     u'''\\def\\authorsexpert{%s}''' % self.get_authors(element, 'expert'),
 131     u'''\\def\\authorsscenario{%s}''' % self.get_authors(element, 'scenario'),
 132     u'''\\def\\authorstextbook{%s}''' % self.get_authors(element, 'textbook'),
 133
 134     u'''\\author{\\authors}''',
 135     u'''\\title{%s}''' % self.get_title(element),
 136     u'''\\def\\bookurl{%s}''' % self.options['wldoc'].book_info.url.canonical(),
 137     u'''\\def\\rightsinfo{%s}''' % self.get_rightsinfo(element),
 138     u'</TeXML>']
 139
 140         return u"".join(filter(None, lines)), u'</TeXML>'
 141
 142
 143     @escape(1)
 144     def handle_powiesc(self, element):
 145         return u"""
 146     <env name="document">
 147     <cmd name="maketitle"/>
 148     """, """<cmd name="editorialsection" /></env>"""
 149
 150     @escape(1)
 151     def handle_texcommand(self, element):
 152         cmd = functions.texcommand(element.tag)
 153         return u'<TeXML escape="1"><cmd name="%s"><parm>' % cmd, u'</parm></cmd></TeXML>'
 154
 155     handle_akap = \
 156     handle_akap = \
 157     handle_akap_cd = \
 158     handle_akap_cd = \
 159     handle_akap_dialog = \
 160     handle_akap_dialog = \
 161     handle_autor_utworu = \
 162     handle_dedykacja = \
 163     handle_didaskalia = \
 164     handle_didask_tekst = \
 165     handle_dlugi_cytat = \
 166     handle_dzielo_nadrzedne = \
 167     handle_lista_osoba = \
 168     handle_mat = \
 169     handle_miejsce_czas = \
 170     handle_motto = \
 171     handle_motto_podpis = \
 172     handle_naglowek_akt = \
 173     handle_naglowek_czesc = \
 174     handle_naglowek_listy = \
 175     handle_naglowek_osoba = \
 176     handle_naglowek_podrozdzial = \
 177     handle_naglowek_podrozdzial = \
 178     handle_naglowek_rozdzial = \
 179     handle_naglowek_rozdzial = \
 180     handle_naglowek_scena = \
 181     handle_nazwa_utworu = \
 182     handle_nota = \
 183     handle_osoba = \
 184     handle_pa = \
 185     handle_pe = \
 186     handle_podtytul = \
 187     handle_poezja_cyt = \
 188     handle_pr = \
 189     handle_pt = \
 190     handle_sekcja_asterysk = \
 191     handle_sekcja_swiatlo = \
 192     handle_separator_linia = \
 193     handle_slowo_obce = \
 194     handle_srodtytul = \
 195     handle_tytul_dziela = \
 196     handle_wyroznienie = \
 197     handle_dywiz = \
 198     handle_texcommand
 199
 200     def handle_uwaga(self, _e):
 201         return None
 202     def handle_extra(self, _e):
 203         return None
 204
 205     def handle_nbsp(self, _e):
 206         return '<spec cat="tilde" />'
 207
 208     _handle_strofa = cmd("strofa")
 209
 210     def handle_strofa(self, element):
 211         self.options = {'strofa': True}
 212         return self._handle_strofa(element)
 213
 214     def handle_aktywnosc(self, element):
 215         self.activity_counter += 1
 216         self.options = {
 217             'activity': True,
 218             'activity_counter': self.activity_counter,
 219             'sub_gen': True,
 220         }
 221         submill = EduModule(self.options)
 222
 223         opis = submill.generate(element.xpath('opis')[0])
 224
 225         n = element.xpath('wskazowki')
 226         if n: wskazowki = submill.generate(n[0])
 227
 228         else: wskazowki = ''
 229         n = element.xpath('pomoce')
 230
 231         if n: pomoce = submill.generate(n[0])
 232         else: pomoce = ''
 233
 234         forma = ''.join(element.xpath('forma/text()'))
 235
 236         czas = ''.join(element.xpath('czas/text()'))
 237
 238         counter = self.activity_counter
 239
 240         return u"""
 241 <cmd name="noindent" />
 242 <cmd name="activitycounter"><parm>%(counter)d.</parm></cmd>
 243 <cmd name="activityinfo"><parm>
 244  <cmd name="activitytime"><parm>%(czas)s</parm></cmd>
 245  <cmd name="activityform"><parm>%(forma)s</parm></cmd>
 246  <cmd name="activitytools"><parm>%(pomoce)s</parm></cmd>
 247 </parm></cmd>
 248
 249
 250 %(opis)s
 251
 252 %(wskazowki)s
 253 """ % locals()
 254
 255     handle_opis = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 256     handle_wskazowki = ifoption(sub_gen=True)(lambda s, e: ('', ''))
 257
 258     @ifoption(sub_gen=True)
 259     def handle_pomoce(self, _):
 260         return "Pomoce: ", ""
 261
 262     def handle_czas(self, *_):
 263         return
 264
 265     def handle_forma(self, *_):
 266         return
 267
 268     def handle_lista(self, element, attrs={}):
 269         ltype = element.attrib.get('typ', 'punkt')
 270         if not element.findall("punkt"):
 271             if ltype == 'czytelnia':
 272                 return 'W przygotowaniu.'
 273             else:
 274                 return None
 275         if ltype == 'slowniczek':
 276             surl = element.attrib.get('src', None)
 277             if surl is None:
 278                 # print '** missing src on <slowniczek>, setting default'
 279                 surl = 'http://edukacjamedialna.edu.pl/lekcje/slowniczek/'
 280             sxml = None
 281             if surl:
 282                 sxml = etree.fromstring(self.options['wldoc'].provider.by_uri(surl).get_string())
 283             self.options = {'slowniczek': True, 'slowniczek_xml': sxml }
 284
 285         listcmd = {'num': 'enumerate',
 286                'punkt': 'itemize',
 287                'alfa': 'itemize',
 288                'slowniczek': 'itemize',
 289                'czytelnia': 'itemize'}[ltype]
 290
 291         return u'<env name="%s">' % listcmd, u'</env>'
 292
 293     def handle_punkt(self, element):
 294         return '<cmd name="item"/>', ''
 295
 296     def handle_cwiczenie(self, element):
 297         exercise_handlers = {
 298             'wybor': Wybor,
 299             'uporzadkuj': Uporzadkuj,
 300             'luki': Luki,
 301             'zastap': Zastap,
 302             'przyporzadkuj': Przyporzadkuj,
 303             'prawdafalsz': PrawdaFalsz
 304         }
 305
 306         typ = element.attrib['typ']
 307         self.exercise_counter += 1
 308         if not typ in exercise_handlers:
 309             return '(no handler)'
 310         self.options = {'exercise_counter': self.exercise_counter}
 311         handler = exercise_handlers[typ](self.options)
 312         return handler.generate(element)
 313
 314     # XXX this is copied from pyhtml.py, except for return and
 315     # should be refactored for no code duplication
 316     def handle_definiendum(self, element):
 317         nxt = element.getnext()
 318         definiens_s = ''
 319
 320         # let's pull definiens from another document
 321         if self.options['slowniczek_xml'] is not None and (nxt is None or nxt.tag != 'definiens'):
 322             sxml = self.options['slowniczek_xml']
 323             assert element.text != ''
 324             defloc = sxml.xpath("//definiendum[text()='%s']" % element.text)
 325             if defloc:
 326                 definiens = defloc[0].getnext()
 327                 if definiens.tag == 'definiens':
 328                     subgen = EduModule(self.options)
 329                     definiens_s = subgen.generate(definiens)
 330
 331         return u'<cmd name="textbf"><parm>', u"</parm></cmd>: " + definiens_s
 332
 333     def handle_definiens(self, element):
 334         return u"", u""
 335
 336     def handle_podpis(self, element):
 337         return u"""<env name="figure">""", u"</env>"
 338
 339     def handle_tabela(self, element):
 340         max_col = 0
 341         for w in element.xpath("wiersz"):
 342             ks = w.xpath("kol")
 343             if max_col < len(ks):
 344                 max_col = len(ks)
 345         self.options = {'columnts': max_col}
 346         # styling:
 347                 #        has_frames = int(element.attrib.get("ramki", "0"))
 348                 #        if has_frames: frames_c = "framed"
 349                 #        else: frames_c = ""
 350                 #        return u"""<table class="%s">""" % frames_c, u"</table>"
 351         return u'''
 352 <cmd name="begin"><parm>tabular</parm><parm>%s</parm></cmd>
 353     ''' % ('l' * max_col), \
 354     u'''<cmd name="end"><parm>tabular</parm></cmd>'''
 355
 356     @escape(1)
 357     def handle_wiersz(self, element):
 358         return u"", u'<ctrl ch="\\"/>'
 359
 360     @escape(1)
 361     def handle_kol(self, element):
 362         if element.getnext() is not None:
 363             return u"", u'<spec cat="align" />'
 364         return u"", u""
 365
 366     def handle_link(self, element):
 367         if element.attrib.get('url'):
 368             url = element.attrib.get('url')
 369             if url == element.text:
 370                 return cmd('url')(self, element)
 371             else:
 372                 return cmd('href', parms=[element.attrib['url']])(self, element)
 373         else:
 374             return cmd('emph')(self, element)
 375
 376     def handle_obraz(self, element):
 377         frmt = self.options['format']
 378         name = element.attrib['nazwa'].strip()
 379         image = frmt.get_image(name.strip())
 380         img_path = "obraz/%s" % name.replace("_", "")
 381         frmt.attachments[img_path] = image
 382         return cmd("obraz", parms=[img_path])(self)
 383
 384     def handle_video(self, element):
 385         url = element.attrib.get('url')
 386         if not url:
 387             print '!! <video> missing url'
 388             return
 389         m = re.match(r'(?:https?://)?(?:www.)?youtube.com/watch\?(?:.*&)?v=([^&]+)(?:$|&)', url)
 390         if not m:
 391             print '!! unknown <video> url scheme:', url
 392             return
 393         name = m.group(1)
 394         thumb = IOFile.from_string(urlopen
 395             ("http://img.youtube.com/vi/%s/0.jpg" % name).read())
 396         img_path = "video/%s.jpg" % name.replace("_", "")
 397         self.options['format'].attachments[img_path] = thumb
 398         canon_url = "https://www.youtube.com/watch?v=%s" % name
 399         return cmd("video", parms=[img_path, canon_url])(self)
 400
 401
 402 class Exercise(EduModule):
 403     def __init__(self, *args, **kw):
 404         self.question_counter = 0
 405         super(Exercise, self).__init__(*args, **kw)
 406
 407     handle_rozw_kom = ifoption(teacher=True)(cmd('akap'))
 408
 409     def handle_cwiczenie(self, element):
 410         self.options = {
 411             'exercise': element.attrib['typ'],
 412             'sub_gen': True,
 413         }
 414         self.question_counter = 0
 415         self.piece_counter = 0
 416
 417         header = etree.Element("parm")
 418         header_cmd = etree.Element("cmd", name="naglowekpodrozdzial")
 419         header_cmd.append(header)
 420         header.text = u"Zadanie %d." % self.options['exercise_counter']
 421
 422         pre = etree.tostring(header_cmd, encoding=unicode)
 423         post = u""
 424         # Add a single <pytanie> tag if it's not there
 425         if not element.xpath(".//pytanie"):
 426             qpre, qpost = self.handle_pytanie(element)
 427             pre = pre + qpre
 428             post = qpost + post
 429         return pre, post
 430
 431     def handle_pytanie(self, element):
 432         """This will handle <cwiczenie> element, when there is no <pytanie>
 433         """
 434         self.question_counter += 1
 435         self.piece_counter = 0
 436         pre = post = u""
 437         if self.options['teacher'] and element.attrib.get('rozw'):
 438             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 439         return pre, post
 440
 441     def handle_punkt(self, element):
 442         pre, post = super(Exercise, self).handle_punkt(element)
 443         if self.options['teacher'] and element.attrib.get('rozw'):
 444             post += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 445         return pre, post
 446
 447     def solution_header(self):
 448         par = etree.Element("cmd", name="par")
 449         parm = etree.Element("parm")
 450         parm.text = u"Rozwiązanie:"
 451         par.append(parm)
 452         return etree.tostring(par)
 453
 454     def explicit_solution(self):
 455         if self.options['solution']:
 456             par = etree.Element("cmd", name="par")
 457             parm = etree.Element("parm")
 458             parm.text = self.options['solution']
 459             par.append(parm)
 460             return self.solution_header() + etree.tostring(par)
 461
 462
 463
 464 class Wybor(Exercise):
 465     def handle_cwiczenie(self, element):
 466         pre, post = super(Wybor, self).handle_cwiczenie(element)
 467         is_single_choice = True
 468         pytania = element.xpath(".//pytanie")
 469         if not pytania:
 470             pytania = [element]
 471         for p in pytania:
 472             solutions = re.split(r"[, ]+", p.attrib['rozw'])
 473             if len(solutions) != 1:
 474                 is_single_choice = False
 475                 break
 476             choices = p.xpath(".//*[@nazwa]")
 477             uniq = set()
 478             for n in choices: uniq.add(n.attrib['nazwa'])
 479             if len(choices) != len(uniq):
 480                 is_single_choice = False
 481                 break
 482
 483         self.options = {'single': is_single_choice}
 484         return pre, post
 485
 486     def handle_punkt(self, element):
 487         if self.options['exercise'] and element.attrib.get('nazwa', None):
 488             cmd = 'radio' if self.options['single'] else 'checkbox'
 489             return u'<cmd name="%s"/>' % cmd, ''
 490         else:
 491             return super(Wybor, self).handle_punkt(element)
 492
 493
 494 class Uporzadkuj(Exercise):
 495     def handle_pytanie(self, element):
 496         order_items = element.xpath(".//punkt/@rozw")
 497         return super(Uporzadkuj, self).handle_pytanie(element)
 498
 499
 500 class Przyporzadkuj(Exercise):
 501     def handle_lista(self, lista):
 502         header = etree.Element("parm")
 503         header_cmd = etree.Element("cmd", name="par")
 504         header_cmd.append(header)
 505         if 'nazwa' in lista.attrib:
 506             header.text = u"Kategorie:"
 507         elif 'cel' in lista.attrib:
 508             header.text = u"Elementy do przyporządkowania:"
 509         else:
 510             header.text = u"Lista:"
 511         pre, post = super(Przyporzadkuj, self).handle_lista(lista)
 512         pre = etree.tostring(header_cmd, encoding=unicode) + pre
 513         return pre, post
 514
 515
 516 class Luki(Exercise):
 517     def find_pieces(self, question):
 518         return question.xpath(".//luka")
 519
 520     def solution(self, piece):
 521         piece = deepcopy(piece)
 522         piece.tail = None
 523         sub = EduModule()
 524         return sub.generate(piece)
 525
 526     def handle_pytanie(self, element):
 527         qpre, qpost = super(Luki, self).handle_pytanie(element)
 528
 529         luki = self.find_pieces(element)
 530         random.shuffle(luki)
 531         self.words = u"<env name='itemize'>%s</env>" % (
 532             "".join("<cmd name='item'/>%s" % self.solution(luka) for luka in luki)
 533         )
 534         return qpre, qpost
 535
 536     def handle_opis(self, element):
 537         return '', self.words
 538
 539     def handle_luka(self, element):
 540         luka = "_" * 10
 541         if self.options['teacher']:
 542             piece = deepcopy(element)
 543             piece.tail = None
 544             sub = EduModule()
 545             text = sub.generate(piece)
 546             luka += u" [rozwiązanie: %s]" % text
 547         return luka
 548
 549
 550 class Zastap(Luki):
 551     def find_pieces(self, question):
 552         return question.xpath(".//zastap")
 553
 554     def solution(self, piece):
 555         return piece.attrib['rozw']
 556
 557     def list_header(self):
 558         return u"Elementy do wstawienia"
 559
 560     def handle_zastap(self, element):
 561         piece = deepcopy(element)
 562         piece.tail = None
 563         sub = EduModule()
 564         text = sub.generate(piece)
 565         if self.options['teacher'] and element.attrib.get('rozw'):
 566             text += u" [rozwiązanie: %s]" % element.attrib.get('rozw')
 567         return text
 568
 569
 570 class PrawdaFalsz(Exercise):
 571     def handle_punkt(self, element):
 572         pre, post = super(PrawdaFalsz, self).handle_punkt(element)
 573         if 'rozw' in element.attrib:
 574             post += u" [Prawda/Fałsz]"
 575         return pre, post
 576
 577
 578
 579 def fix_lists(tree):
 580     lists = tree.xpath(".//lista")
 581     for l in lists:
 582         if l.text:
 583             p = l.getprevious()
 584             if p is not None:
 585                 if p.tail is None: p.tail = ''
 586                 p.tail += l.text
 587             else:
 588                 p = l.getparent()
 589                 if p.text is None: p.text = ''
 590                 p.text += l.text
 591             l.text = ''
 592     return tree
 593
 594
 595 class EduModulePDFFormat(PDFFormat):
 596     style = get_resource('res/styles/edumed/pdf/edumed.sty')
 597
 598     def get_texml(self):
 599         substitute_hyphens(self.wldoc.edoc)
 600         fix_hanging(self.wldoc.edoc)
 601
 602         self.attachments = {}
 603         edumod = EduModule({
 604             "wldoc": self.wldoc,
 605             "format": self,
 606             "teacher": self.customization.get('teacher'),
 607         })
 608         texml = edumod.generate(fix_lists(self.wldoc.edoc.getroot())).encode('utf-8')
 609
 610         open("/tmp/texml.xml", "w").write(texml)
 611         return texml
 612
 613     def get_tex_dir(self):
 614         temp = super(EduModulePDFFormat, self).get_tex_dir()
 615         shutil.copy(get_resource('res/styles/edumed/logo.png'), temp)
 616         for name, iofile in self.attachments.items():
 617             iofile.save_as(os.path.join(temp, name))
 618         return temp
 619
 620     def get_image(self, name):
 621         return self.wldoc.source.attachments[name]
 622